llama-rb 0.1.0 → 0.2.0
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +1 -3
- data/bin/console +7 -0
- data/ext/Makefile +4 -0
- data/ext/extconf.rb +10 -0
- data/lib/llama/model.rb +36 -64
- data/lib/llama/version.rb +1 -1
- data/lib/llama.rb +0 -1
- data/llama-rb.gemspec +25 -25
- data/llama.cpp/LICENSE +21 -0
- data/llama.cpp/Makefile +175 -0
- data/llama.cpp/README.md +389 -0
- data/{ext/llama → llama.cpp/examples}/common.cpp +10 -3
- data/llama.cpp/examples/main/main.cpp +460 -0
- data/{ext/llama → llama.cpp}/ggml.c +587 -485
- data/{ext/llama → llama.cpp}/ggml.h +36 -26
- data/{ext/llama → llama.cpp}/llama.cpp +85 -35
- data/{ext/llama → llama.cpp}/llama.h +17 -0
- metadata +18 -27
- data/ext/llama/extconf.rb +0 -12
- data/ext/llama/model.cpp +0 -192
- data/{ext/llama → llama.cpp/examples}/common.h +0 -0
data/llama.cpp/ggml.h
CHANGED
@@ -258,11 +258,11 @@ struct ggml_tensor {
     enum ggml_type type;

     int    n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0] * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0] * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

     // compute data
     enum ggml_op op;
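
Note (not part of the diff): the nb comments above fully determine the memory layout. A minimal C sketch of that stride rule; MAX_DIMS, the float element type, and the example shape are illustrative assumptions, and block-size padding is ignored for simplicity:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    #define MAX_DIMS 4

    int main(void) {
        int64_t ne[MAX_DIMS] = {4096, 32, 1, 1};   // number of elements per dimension
        size_t  nb[MAX_DIMS];
        nb[0] = sizeof(float);                     // nb[0] = sizeof(type)
        for (int i = 1; i < MAX_DIMS; ++i) {
            nb[i] = nb[i-1] * (size_t) ne[i-1];    // nb[i] = nb[i-1] * ne[i-1]
        }
        // byte offset of element (i0, i1) is i0*nb[0] + i1*nb[1]
        printf("row stride = %zu bytes\n", nb[1]);
        return 0;
    }
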
@@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int     ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int     *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -531,33 +531,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
+        int64_t ne0,
         size_t offset);

 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1, // row stride in bytes
         size_t offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t  nb1, // row stride in bytes
+        size_t  nb2, // slice stride in bytes
+        size_t  offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
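
Note (not part of the diff): the new ggml_view_3d is what the llama.cpp hunks further down use to slice the flat V cache into per-head views without copying. A sketch of that call pattern, lifted from the diff; the helper name split_v_heads and its parameter list are assumptions for illustration:

    #include "ggml.h"

    // Return a [n_past + N, n_embd/n_head, n_head] strided view into the
    // flat per-layer V cache tensor, selecting layer il.
    struct ggml_tensor * split_v_heads(
            struct ggml_context * ctx,
            struct ggml_tensor  * v,   // flat KV-cache value tensor
            int n_ctx, int n_embd, int n_head,
            int n_past, int N, int il) {
        const size_t es = ggml_element_size(v);
        return ggml_view_3d(ctx, v,
                n_past + N,                // ne0: token positions per row
                n_embd/n_head,             // ne1: head dimension
                n_head,                    // ne2: heads
                n_ctx*es,                  // nb1: row stride in bytes
                n_ctx*es*n_embd/n_head,    // nb2: slice stride (one head)
                il*n_ctx*es*n_embd);       // byte offset of layer il
    }
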
data/llama.cpp/llama.cpp
CHANGED
@@ -256,8 +256,8 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

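Note (not part of the diff): widening to int64_t guards the products against 32-bit overflow. A quick illustration; the concrete sizes are assumptions, not values from this diff:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int n_layer = 80, n_ctx = 8192, n_embd = 8192;
        // In 32-bit arithmetic, 80*8192*8192 = 5,368,709,120 would overflow
        // INT_MAX (2,147,483,647). Widening before multiplying avoids that.
        const int64_t n_mem      = (int64_t) n_layer*n_ctx; // 64-bit from here on
        const int64_t n_elements = n_embd*n_mem;            // 64-bit product
        printf("n_elements = %lld\n", (long long) n_elements);
        return 0;
    }
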
@@ -679,7 +679,7 @@ static bool llama_model_load(
                 return false;
             }
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                 return false;
             }
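
Note (not part of the diff): since ne[] is now int64_t, the old %d specifiers no longer match the argument type; the PRId64 macro from <inttypes.h> expands to the correct conversion specifier on every platform. A tiny standalone example:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        int64_t ne0 = 4096, ne1 = 11008;
        // "%" PRId64 expands to e.g. "%ld" or "%lld" depending on the platform
        printf("got [%" PRId64 ", %" PRId64 "]\n", ne0, ne1);
        return 0;
    }
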
@@ -810,37 +810,35 @@ static bool llama_eval_internal(

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

             // store key and value to memory
-            if (N >= 1) {
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }

-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);

-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);

             // K * Q
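
Note (not part of the diff): with this change the V cache holds, per layer, a transposed [n_ctx, n_embd] matrix: each row keeps one embedding dimension across n_ctx token slots, which is what makes the strided view slicing in the next hunk possible without a copy. A sketch of the resulting byte-offset arithmetic; the helper is hypothetical, derived from the view offsets above:

    #include <stddef.h>

    // Byte offset of the V-cache entry for layer il, embedding dimension d,
    // token position t, assuming the transposed [n_ctx, n_embd] layout per
    // layer that the hunk above writes into.
    size_t v_cache_offset(size_t elem_size, int n_ctx, int n_embd,
                          int il, int d, int t) {
        // layer block, then row d (one dimension across n_ctx tokens), then slot t
        return elem_size * ((size_t) il*n_ctx*n_embd
                          + (size_t) d*n_ctx
                          + (size_t) t);
    }
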
@@ -858,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);

-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -955,9 +958,13 @@ static bool llama_eval_internal(
         ggml_build_forward_expand(&gf, inpL);
         ggml_graph_compute       (ctx0, &gf);

+        // print timing information per ggml operation (for debugging purposes)
+        // requires GGML_PERF to be defined
+        //ggml_graph_print(&gf);
+
+        // plot the computation graph in dot format (for debugging purposes)
         //if (n_past%100 == 0) {
-        //    ggml_graph_print   (&gf);
-        //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+        //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
         //}

         //embd_w.resize(n_vocab*N);
@@ -1194,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;

+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);

@@ -1215,7 +1236,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }

-    sample_top_k(logits_id, top_k);
+    if (top_k > 0 && top_k < n_logits) {
+        sample_top_k(logits_id, top_k);
+    }

     float maxl = -std::numeric_limits<float>::infinity();
     for (const auto & kv : logits_id) {
@@ -1608,7 +1631,7 @@ struct llama_context * llama_init_from_file(
     }

     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1668,6 +1691,33 @@ int llama_model_quantize(
     return 0;
 }

+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+               const uint8_t * kv_cache,
+                      size_t   n_size,
+                         int   n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
data/llama.cpp/llama.h
CHANGED
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int   itype);

+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                   const uint8_t * kv_cache,
+                          size_t   n_size,
+                             int   n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
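
Note (not part of the diff): taken together, the four new functions allow snapshotting a context and rewinding to it later, e.g. to reuse an evaluated prompt. A usage sketch against the API as declared above; the surrounding setup (llama_init_from_file, prompt evaluation) is assumed and error handling is elided:

    #include "llama.h"
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    void snapshot_and_restore(struct llama_context * ctx) {
        const size_t size    = llama_get_kv_cache_size(ctx);
        const int    n_token = llama_get_kv_cache_token_count(ctx);

        // copy the cache out: the returned pointer aliases the live buffer,
        // so its contents change as soon as further evals run
        uint8_t * copy = (uint8_t *) malloc(size);
        memcpy(copy, llama_get_kv_cache(ctx), size);

        // ... run more llama_eval calls here ...

        // rewind the context to the snapshot
        llama_set_kv_cache(ctx, copy, size, n_token);
        free(copy);
    }
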
metadata
CHANGED
@@ -1,35 +1,21 @@
 --- !ruby/object:Gem::Specification
 name: llama-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - zfletch
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: rice
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 4.0.4
+date: 2023-04-06 00:00:00.000000000 Z
+dependencies: []
 description: ggerganov/llama.cpp with Ruby hooks
 email:
--
+- zf.rubygems@gmail.com
 executables: []
 extensions:
-- ext/llama/extconf.rb
+- ext/extconf.rb
 extra_rdoc_files: []
 files:
 - Gemfile
@@ -37,18 +23,23 @@ files:
 - LICENSE
 - README.md
 - Rakefile
-- ext/llama/common.cpp
-- ext/llama/common.h
-- ext/llama/extconf.rb
-- ext/llama/ggml.c
-- ext/llama/ggml.h
-- ext/llama/llama.cpp
-- ext/llama/llama.h
-- ext/llama/model.cpp
+- bin/console
+- ext/Makefile
+- ext/extconf.rb
 - lib/llama.rb
 - lib/llama/model.rb
 - lib/llama/version.rb
 - llama-rb.gemspec
+- llama.cpp/LICENSE
+- llama.cpp/Makefile
+- llama.cpp/README.md
+- llama.cpp/examples/common.cpp
+- llama.cpp/examples/common.h
+- llama.cpp/examples/main/main.cpp
+- llama.cpp/ggml.c
+- llama.cpp/ggml.h
+- llama.cpp/llama.cpp
+- llama.cpp/llama.h
 - models/.gitkeep
 homepage: https://github.com/zfletch/llama-rb
 licenses:
data/ext/llama/extconf.rb
DELETED
@@ -1,12 +0,0 @@
-require 'mkmf-rice'
-
-# Compile llama.cpp
-# root = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
-# llama_cpp = File.join(root, 'llama.cpp')
-#
-# Dir.chdir(llama_cpp) do
-#   system("make", exception: true)
-# end
-
-# Create Makefile for Ruby bindings
-create_makefile 'llama/model'
data/ext/llama/model.cpp
DELETED
@@ -1,192 +0,0 @@
-#include <rice/rice.hpp>
-
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-class ModelCpp
-{
-  public:
-    llama_context *ctx;
-    ModelCpp()
-    {
-      ctx = NULL;
-    }
-    void model_initialize(
-        const char *model,
-        const int32_t n_ctx,
-        const int32_t n_parts,
-        const int32_t seed,
-        const bool memory_f16,
-        const bool use_mlock
-    );
-    Rice::Object model_predict(
-        const char *prompt,
-        const int32_t n_predict
-    );
-    ~ModelCpp()
-    {
-
-      if (ctx != NULL) {
-        llama_free(ctx);
-      }
-    }
-};
-
-void ModelCpp::model_initialize(
-    const char *model,      // path to model file, e.g. "models/7B/ggml-model-q4_0.bin"
-    const int32_t n_ctx,    // context size
-    const int32_t n_parts,  // amount of model parts (-1 = determine from model dimensions)
-    const int32_t seed,     // RNG seed
-    const bool memory_f16,  // use f16 instead of f32 for memory kv
-    const bool use_mlock    // use mlock to keep model in memory
-)
-{
-  auto lparams = llama_context_default_params();
-
-  lparams.n_ctx = n_ctx;
-  lparams.n_parts = n_parts;
-  lparams.seed = seed;
-  lparams.f16_kv = memory_f16;
-  lparams.use_mlock = use_mlock;
-
-  ctx = llama_init_from_file(model, lparams);
-}
-
-Rice::Object ModelCpp::model_predict(
-    const char *prompt,     // string used as prompt
-    const int32_t n_predict // number of tokens to predict
-)
-{
-  std::string return_val = "";
-
-  gpt_params params;
-  params.prompt = prompt;
-  params.n_predict = n_predict;
-
-  // add a space in front of the first character to match OG llama tokenizer behavior
-  params.prompt.insert(0, 1, ' ');
-
-  // tokenize the prompt
-  auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-  const int n_ctx = llama_n_ctx(ctx);
-
-  // determine newline token
-  auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-  // generate output
-  {
-    std::vector<llama_token> last_n_tokens(n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    int n_past = 0;
-    int n_remain = params.n_predict;
-    int n_consumed = 0;
-
-    std::vector<llama_token> embd;
-
-    while (n_remain != 0) {
-      if (embd.size() > 0) {
-        // infinite text generation via context swapping
-        // if we run out of context:
-        // - take the n_keep first tokens from the original prompt (via n_past)
-        // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
-        if (n_past + (int) embd.size() > n_ctx) {
-          const int n_left = n_past - params.n_keep;
-
-          n_past = params.n_keep;
-
-          // insert n_left/2 tokens at the start of embd from last_n_tokens
-          embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
-        }
-
-        if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
-          throw Rice::Exception(rb_eRuntimeError, "Failed to eval");
-        }
-      }
-
-
-      n_past += embd.size();
-      embd.clear();
-
-      if ((int) embd_inp.size() <= n_consumed) {
-        // out of user input, sample next token
-        const int32_t top_k = params.top_k;
-        const float top_p = params.top_p;
-        const float temp = params.temp;
-        const float repeat_penalty = params.repeat_penalty;
-
-        llama_token id = 0;
-
-        {
-          auto logits = llama_get_logits(ctx);
-
-          if (params.ignore_eos) {
-            logits[llama_token_eos()] = 0;
-          }
-
-          id = llama_sample_top_p_top_k(ctx,
-              last_n_tokens.data() + n_ctx - params.repeat_last_n,
-              params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
-
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(id);
-        }
-
-        // replace end of text token with newline token when in interactive mode
-        if (id == llama_token_eos() && params.interactive && !params.instruct) {
-          id = llama_token_newline.front();
-          if (params.antiprompt.size() != 0) {
-            // tokenize and inject first reverse prompt
-            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-            embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-          }
-        }
-
-        // add it to the context
-        embd.push_back(id);
-
-        // decrement remaining sampling budget
-        --n_remain;
-      } else {
-        // some user input remains from prompt or interaction, forward it to processing
-        while ((int) embd_inp.size() > n_consumed) {
-          embd.push_back(embd_inp[n_consumed]);
-          last_n_tokens.erase(last_n_tokens.begin());
-          last_n_tokens.push_back(embd_inp[n_consumed]);
-          ++n_consumed;
-          if ((int) embd.size() >= params.n_batch) {
-            break;
-          }
-        }
-      }
-
-      for (auto id : embd) {
-        return_val += llama_token_to_str(ctx, id);
-      }
-    }
-  }
-
-  Rice::String ruby_return_val(return_val);
-  return ruby_return_val;
-}
-
-extern "C"
-void Init_model()
-{
-  Rice::Module rb_mLlama = Rice::define_module("Llama");
-  Rice::Data_Type<ModelCpp> rb_cModel = Rice::define_class_under<ModelCpp>(rb_mLlama, "Model");
-
-  rb_cModel.define_constructor(Rice::Constructor<ModelCpp>());
-  rb_cModel.define_method("initialize_cpp", &ModelCpp::model_initialize);
-  rb_cModel.define_method("predict_cpp", &ModelCpp::model_predict);
-}
data/{ext/llama → llama.cpp/examples}/common.h
RENAMED
File without changes