llama_cpp 0.0.6 → 0.0.7

This diff reflects the changes between the publicly released 0.0.6 and 0.0.7 versions of the package as they appear in its public registry, and is provided for informational purposes only.
@@ -27,6 +27,7 @@
  #include <thread>
  #include <atomic>
  #include <mutex>
+ #include <sstream>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH0;
  }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH1;
- };
+ }

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  { MODEL_65B, 5120ull * MB },
  };
  return _MEM_REQ_KV_SELF;
- };
+ }

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  { MODEL_65B, 1536ull * MB },
  };
  return _MEM_REQ_EVAL;
- };
+ }

  // default hparams (LLaMA 7B)
  struct llama_hparams {
@@ -483,6 +484,9 @@ struct llama_file_loader {
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: {
  throw format("unrecognized tensor type %u\n", shard.type);
@@ -557,6 +561,9 @@ struct llama_file_saver {
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
  case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -847,6 +854,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  return "mostly Q4_1, some F16";
  case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
  case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
  default: return "unknown, may not work";
  }
  }
@@ -1075,7 +1085,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1249,9 +1259,11 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);

+ #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
  // requires GGML_PERF to be defined
- //ggml_graph_print(&gf);
+ ggml_graph_print(&gf);
+ #endif

  // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
@@ -1582,6 +1594,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
  case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
  default: throw format("invalid output file type %d\n", ftype);
  };

@@ -1618,8 +1633,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);

- // GG: uncomment this to keep the output layer in FP16
- //if (tensor.name.rfind("output")) {
+ // uncomment this to keep the output layer in FP16
+ //if (tensor.name == "output.weight") {
  // quantize = false;
  //}

@@ -1787,7 +1802,7 @@ struct llama_context * llama_init_from_file(
  if (params.logits_all) {
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
  } else {
- ctx->logits.reserve(hparams.n_ctx);
+ ctx->logits.reserve(hparams.n_vocab);
  }

  if (params.embedding){
@@ -2069,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.addr;
+ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
  }

- // Returns the size of the KV cache
- size_t llama_get_kv_cache_size(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.size;
+ #define LLAMA_MAX_RNG_STATE 64*1024
+
+ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+ if (seed <= 0) {
+ seed = time(NULL);
+ }
+ ctx->rng.seed(seed);
  }

- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ // Returns the size of the state
+ size_t llama_get_state_size(struct llama_context * ctx) {
+ // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+ // for reference, std::mt19937(1337) serializes to 6701 bytes.
+ const size_t s_rng_size = sizeof(size_t);
+ const size_t s_rng = LLAMA_MAX_RNG_STATE;
+ const size_t s_logits_capacity = sizeof(size_t);
+ const size_t s_logits_size = sizeof(size_t);
+ const size_t s_logits = ctx->logits.capacity() * sizeof(float);
+ const size_t s_embedding_size = sizeof(size_t);
+ const size_t s_embedding = ctx->embedding.size() * sizeof(float);
+ const size_t s_kv_size = sizeof(size_t);
+ const size_t s_kv_ntok = sizeof(int);
+ const size_t s_kv = ctx->model.kv_self.buf.size;
+
+ const size_t s_total = (
+ + s_rng_size
+ + s_rng
+ + s_logits_capacity
+ + s_logits_size
+ + s_logits
+ + s_embedding_size
+ + s_embedding
+ + s_kv_size
+ + s_kv_ntok
+ + s_kv
+ );
+
+ return s_total;
  }

- // Sets the KV cache containing the current context for the model
- void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count) {
- // Make sure we have the same kv cache setup
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
- memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
- ctx->model.kv_self.n = n_token_count;
+ // Copies the state to the specified destination address
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+ uint8_t * out = dest;
+
+ // copy rng
+ {
+ std::stringstream rng_ss;
+ rng_ss << ctx->rng;
+
+ const size_t rng_size = rng_ss.str().size();
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+ memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+ memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+ memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+ }
+
+ // copy logits
+ {
+ const size_t logits_cap = ctx->logits.capacity();
+ const size_t logits_size = ctx->logits.size();
+
+ memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
+ memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+ if (logits_size) {
+ memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+ }
+
+ out += logits_cap * sizeof(float);
+ }
+
+ // copy embeddings
+ {
+ const size_t embedding_size = ctx->embedding.size();
+
+ memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+ if (embedding_size) {
+ memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+ out += embedding_size * sizeof(float);
+ }
+ }
+
+ // copy kv cache
+ {
+ const size_t kv_size = ctx->model.kv_self.buf.size;
+ const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+ memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+ memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+ if (kv_size) {
+ memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+ }
+ }
+
+ const size_t written = out - dest;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(written == expected);
+
+ return written;
+ }
+
+ // Sets the state reading from the specified source address
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ const uint8_t * in = src;
+
+ // set rng
+ {
+ size_t rng_size;
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+ memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+ std::stringstream rng_ss;
+ rng_ss.str(std::string(&rng_buf[0], rng_size));
+ rng_ss >> ctx->rng;
+
+ LLAMA_ASSERT(rng_ss.fail() == false);
+ }
+
+ // set logits
+ {
+ size_t logits_cap;
+ size_t logits_size;
+
+ memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
+ memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+ LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+ if (logits_size) {
+ ctx->logits.resize(logits_size);
+ memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ }
+
+ in += logits_cap * sizeof(float);
+ }
+
+ // set embeddings
+ {
+ size_t embedding_size;
+
+ memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+ LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+ if (embedding_size) {
+ memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+ in += embedding_size * sizeof(float);
+ }
+ }
+
+ // set kv cache
+ {
+ size_t kv_size;
+ int kv_ntok;
+
+ memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+ memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+ if (kv_size) {
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+ void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+ void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+ memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+ ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+ ctx->model.kv_self.v->data = v_data;
+
+ }
+
+ ctx->model.kv_self.n = kv_ntok;
+ }
+
+ const size_t nread = in - src;
+ const size_t expected = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(nread == expected);
+
+ return nread;
  }

  int llama_eval(
@@ -2248,3 +2430,4 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
@@ -74,6 +74,9 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
  LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +115,23 @@ extern "C" {
  const char * path_base_model,
  int n_threads);

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
- // Returns the size of the KV cache
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
  // Returns the number of tokens in the KV cache
  LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

- // Sets the KV cache containing the current context for the model
- LLAMA_API void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count);
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+ // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+ LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+ // Copies the state to the specified destination address.
+ // Destination needs to have allocated enough memory.
+ // Returns the number of bytes copied
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+ // Set the state reading from the specified address
+ // Returns the number of bytes read
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
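
Note: the hunk above replaces the old llama_get_kv_cache/llama_set_kv_cache pair with an opaque state blob covering the rng, logits, embeddings and KV cache. The following is a minimal sketch of how a caller might use the new functions; it is not part of the diff and assumes ctx is a valid llama_context and that saving and restoring happen on contexts created with identical parameters.

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Serialize the full context state into a byte buffer.
    std::vector<uint8_t> save_state(struct llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx)); // size reported before copying
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written); // in this version the copy writes exactly the reported size
        return buf;
    }

    // Restore a previously saved state into a context created with the same parameters.
    void restore_state(struct llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());
    }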
@@ -21,6 +21,9 @@
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
  #endif
+ #if defined(_POSIX_MEMLOCK_RANGE)
+ #include <sys/resource.h>
+ #endif
  #endif
  #endif

@@ -303,8 +306,18 @@ struct llama_mlock {
  if (!mlock(addr, size)) {
  return true;
  } else {
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
- size, this->size, std::strerror(errno));
+ char* errmsg = std::strerror(errno);
+ bool suggest = (errno == ENOMEM);
+
+ // Check if the resource limit is fine after all
+ struct rlimit lock_limit;
+ if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+ suggest = false;
+ if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+ suggest = false;
+
+ fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  return false;
  }
  }
@@ -0,0 +1,151 @@
+ # frozen_string_literal: true
+
+ module LLaMACpp
+ # Client provides a high-level interface to the LLM model.
+ class Client
+ # Creates a new client.
+ #
+ # @param model_path [String] The path to the model file.
+ # @param lora_adapter_path [String] The path to the LoRA adapter file.
+ # @param lora_base_path [String] The path to the LoRA base model file.
+ # @param n_ctx [Integer] The context size.
+ # @param n_parts [Integer] The number of model parts (-1 = determine from model dimensions).
+ # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
+ # @param use_mmap [Boolean] The flag whether to use mmap.
+ # @param use_mlock [Boolean] The flag whether to use mlock.
+ # @param embedding [Boolean] The flag whether to calculate embedding.
+ # @param n_threads [Integer] The number of threads to use.
+ # @param seed [Integer] The seed for the random number generator.
+ # @return [Client]
+ # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+ def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+ n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+ embedding: false,
+ n_threads: 1, seed: 0)
+ @params = {
+ model_path: model_path,
+ lora_adapter_path: lora_adapter_path,
+ lora_base_path: lora_base_path,
+ n_ctx: n_ctx,
+ n_parts: n_parts,
+ memory_f16: memory_f16,
+ use_mmap: use_mmap,
+ use_mlock: use_mlock,
+ embedding: embedding,
+ n_threads: n_threads,
+ seed: seed
+ }
+ @context_params = ContextParams.new
+ @context_params.n_ctx = n_ctx
+ @context_params.n_parts = n_parts
+ @context_params.f16_kv = memory_f16
+ @context_params.use_mmap = use_mmap
+ @context_params.use_mlock = use_mlock
+ @context_params.embedding = embedding
+ @context_params.seed = seed
+ @context = Context.new(model_path: model_path, params: @context_params)
+ return unless lora_adapter_path.is_a?(String)
+
+ if lora_base_path.is_a?(String)
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+ else
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+ end
+ end
+ # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+ # Generates completions for a given prompt.
+ #
+ # @param prompt [String] The prompt to generate completions for.
+ # @param max_tokens [Integer] The maximum number of tokens to generate.
+ # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+ # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+ # @param n_batch [Integer] The batch size.
+ # @param top_k [Integer] The top-k value.
+ # @param top_p [Float] The top-p value.
+ # @param temperature [Float] The temperature value.
+ # @param repeat_penalty [Float] The repeat penalty value.
+ # @return [String]
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+ top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+ embd_input = tokenize_prompt(prompt)
+
+ n_ctx = @context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+ last_n_tokens = [0] * n_ctx
+
+ embd = []
+ n_consumed = 0
+ n_past = 0
+ n_remain = max_tokens
+ output = []
+
+ while n_remain != 0
+ unless embd.empty?
+ if n_past + embd.size > n_ctx
+ n_left = n_past - n_keep
+ n_past = n_keep
+ embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+ end
+
+ @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+ end
+
+ n_past += embd.size
+ embd.clear
+
+ if embd_input.size <= n_consumed
+ start = n_ctx - repeat_last_n
+ id = @context.sample_top_p_top_k(
+ last_n_tokens[start...(start + repeat_last_n)],
+ top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+ )
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
+ embd.push(id)
+ n_remain -= 1
+ else
+ while embd_input.size > n_consumed
+ embd.push(embd_input[n_consumed])
+ last_n_tokens.shift
+ last_n_tokens.push(embd_input[n_consumed])
+ n_consumed += 1
+ break if embd.size >= n_batch
+ end
+ end
+
+ embd.each { |token| output << @context.token_to_str(token) }
+
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+ end
+
+ output.join.delete_prefix(" #{prompt}").strip
+ end
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+ # def chat(prompt); end
+
+ # Obtains the embedding for a given text.
+ #
+ # @param text [String] The text to obtain the embedding for.
+ # @return [Array<Float>]
+ def embeddings(text)
+ raise 'The embedding option is set to false' unless @params[:embedding]
+
+ embd_input = tokenize_prompt(text)
+ raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+ @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+ @context.embeddings
+ end
+
+ private
+
+ def tokenize_prompt(prompt)
+ @context.tokenize(text: " #{prompt}", add_bos: true)
+ end
+ end
+ end
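
Note: the new LLaMACpp::Client class added above wraps ContextParams/Context setup, optional LoRA loading, tokenization and sampling behind two calls. A short usage sketch, not part of the diff; the model path is a placeholder and the option values are only examples.

    require 'llama_cpp'

    # '/path/to/model.bin' is a placeholder for a GGML model file supported by this build.
    client = LLaMACpp::Client.new(model_path: '/path/to/model.bin', n_threads: 4, seed: 42)

    # Text completion with a smaller budget than the default max_tokens: 64.
    puts client.completions('Ruby is a programming language that', max_tokens: 32)

    # Embeddings require a client created with embedding: true.
    embedder = LLaMACpp::Client.new(model_path: '/path/to/model.bin', embedding: true, n_threads: 4)
    vector = embedder.embeddings('Hello, world.')
    puts vector.length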
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.6'
+ VERSION = '0.0.7'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-12b5900'
+ LLAMA_CPP_VERSION = 'master-11d9023'
  end
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@

  require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'
+ require_relative 'llama_cpp/client'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp

  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context]
- # @param prompt [String]
- # @param n_threads [Integer]
+ # @param context [LLaMACpp::Context] The context to use.
+ # @param prompt [String] The prompt to start generation with.
+ # @param n_predict [Integer] The number of tokens to predict.
+ # @param n_threads [Integer] The number of threads.
  # @return [String]
- def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- spaced_prompt = " #{prompt}"
+ def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ raise ArgumentError, 'context must have loaded the model' if context.empty?
+ raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

+ spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
- n_remain = 128
+ n_remain = n_predict
  repeat_last_n = 64
+ n_batch = 512
  output = []

  while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
  last_n_tokens.shift
  last_n_tokens.push(embd_input[n_consumed])
  n_consumed += 1
- break if embd.size >= 512
+ break if embd.size >= n_batch
  end
  end

  embd.each { |token| output << context.token_to_str(token) }

- break if embd[-1] == LLaMACpp.token_eos
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
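
Note: with the hunks above, LLaMACpp.generate now validates its arguments, rejects prompts longer than n_ctx - 4 tokens, and exposes the prediction length as n_predict instead of the hard-coded 128. A brief usage sketch, not part of the diff; the model path is a placeholder.

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.seed = 123
    context = LLaMACpp::Context.new(model_path: '/path/to/model.bin', params: params)

    # n_predict caps the number of generated tokens (previously fixed at 128).
    puts LLaMACpp.generate(context, 'Getting started with Ruby is', n_predict: 32, n_threads: 4)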