RubyGems - llama_cpp - Versions diffs - 0.13.0 → 0.14.0 - Mend

llama_cpp 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +13 -0
data/ext/llama_cpp/llama_cpp.cpp +59 -26
data/lib/llama_cpp/version.rb +2 -2
data/sig/llama_cpp.rbs +6 -4
data/vendor/tmp/llama.cpp/Makefile +2 -3
data/vendor/tmp/llama.cpp/ggml-backend-impl.h +4 -3
data/vendor/tmp/llama.cpp/ggml-backend.c +18 -21
data/vendor/tmp/llama.cpp/ggml-backend.h +16 -15
data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
data/vendor/tmp/llama.cpp/ggml-kompute.cpp +2 -2
data/vendor/tmp/llama.cpp/ggml-metal.m +63 -7
data/vendor/tmp/llama.cpp/ggml-metal.metal +120 -75
data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
data/vendor/tmp/llama.cpp/ggml-quants.c +178 -133
data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3432 -1118
data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1327 -773
data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
data/vendor/tmp/llama.cpp/ggml.c +227 -15
data/vendor/tmp/llama.cpp/ggml.h +30 -4
data/vendor/tmp/llama.cpp/llama.cpp +631 -211
data/vendor/tmp/llama.cpp/llama.h +28 -10
metadata +2 -2

data/vendor/tmp/llama.cpp/llama.h CHANGED Viewed

@@ -129,6 +129,7 @@ extern "C" {
     };
     enum llama_pooling_type {
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
@@ -162,7 +163,7 @@ extern "C" {
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
     // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits for the respective token will not be output
+    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -172,7 +173,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits;
+        int8_t       *  logits; // TODO: rename this to "output"
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         //       for future-proof code, use the above fields instead and ignore everything below
@@ -236,7 +237,10 @@ extern "C" {
         uint32_t n_batch;           // prompt processing maximum batch size
         uint32_t n_threads;         // number of threads to use for generation
         uint32_t n_threads_batch;   // number of threads to use for batch processing
-        int32_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+                                                        // (ignored if no pooling layer)
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -255,10 +259,15 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embedding;   // embedding mode only
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
     // model quantization parameters
@@ -632,7 +641,10 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
-    // Token logits obtained from the last call to llama_eval()
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Rows: n_tokens provided with llama_batch
@@ -643,14 +655,20 @@ extern "C" {
     // llama_get_logits(ctx) + i*n_vocab
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
+    // Get all output token embeddings
+    // shape: [n_tokens*n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-    // Get the embeddings for the ith sequence
+    // Get the embeddings for the ith token
     // llama_get_embeddings(ctx) + i*n_embd
+    // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
     //
     // Vocab
     //

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.14.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-03-02 00:00:00.000000000 Z
+date: 2024-03-09 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: