llama_cpp 0.4.0 → 0.5.1

@@ -10,6 +10,7 @@
  #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
+ #include <stdio.h>
  #include <stdbool.h>

  #ifdef LLAMA_SHARED
@@ -163,6 +164,7 @@ extern "C" {
  enum llama_ftype ftype; // quantize to this llama_ftype
  bool allow_requantize; // allow quantizing non-f32/f16 tensors
  bool quantize_output_tensor; // quantize output.weight
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
  } llama_model_quantize_params;

  // grammar types
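A minimal sketch (not part of the diff) of how the new only_copy flag could be used to rewrite a model file without requantizing it; llama_model_quantize_default_params() and the file names are assumptions not shown in this diff:

// copy tensors as-is: ftype, allow_requantize and quantize_output_tensor are ignored
llama_model_quantize_params qparams = llama_model_quantize_default_params();
qparams.only_copy = true;
if (llama_model_quantize("model-in.gguf", "model-out.gguf", &qparams) != 0) {
    fprintf(stderr, "model copy failed\n");
}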
@@ -254,7 +256,11 @@ extern "C" {
  LLAMA_API int llama_model_n_embd (const struct llama_model * model);

  // Get a string describing the model type
- LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+ LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+ // Returns the total size of all the tensors in the model in bytes
+ LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+ // Returns the total number of parameters in the model
+ LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

  // Returns 0 on success
  LLAMA_API int llama_model_quantize(
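A hedged usage sketch (not from the diff) of the renamed llama_model_desc() and the new size/parameter getters; obtaining the struct llama_model * (e.g. via llama_load_model_from_file()) and the format strings are assumptions:

char desc[128];
llama_model_desc(model, desc, sizeof(desc));            // short human-readable model description
const uint64_t n_bytes  = llama_model_size(model);      // total tensor size in bytes
const uint64_t n_params = llama_model_n_params(model);  // total parameter count
printf("%s | %.2f GiB | %.2fB params\n",
       desc, n_bytes / (1024.0 * 1024.0 * 1024.0), n_params / 1e9);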
@@ -377,15 +383,17 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

- // Token Id -> String. Uses the vocabulary in the provided context
- // Does not write null terminator to the buffer
- LLAMA_API int llama_token_to_str(
+ // Token Id -> Piece.
+ // Uses the vocabulary in the provided context.
+ // Does not write null terminator to the buffer.
+ // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+ LLAMA_API int llama_token_to_piece(
  const struct llama_context * ctx,
  llama_token token,
  char * buf,
  int length);

- LLAMA_API int llama_token_to_str_with_model(
+ LLAMA_API int llama_token_to_piece_with_model(
  const struct llama_model * model,
  llama_token token,
  char * buf,
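A minimal sketch (not part of the diff) of decoding with the renamed llama_token_to_piece(); treating the return value as the number of bytes written is my assumption, grounded only in the "does not write null terminator" comment above:

char piece[64];
const int n = llama_token_to_piece(ctx, token, piece, (int) sizeof(piece));
if (n >= 0) {
    fwrite(piece, 1, (size_t) n, stdout); // buf is not null-terminated
}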
@@ -402,6 +410,8 @@ extern "C" {

  LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

+ LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
  //
  // Sampling functions
  //
@@ -465,6 +475,43 @@ extern "C" {
  /// @details Accepts the sampled token into the grammar
  LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);

+ //
+ // Beam search
+ //
+
+ struct llama_beam_view {
+ const llama_token * tokens;
+ size_t n_tokens;
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Callback should set this to true when a beam is at end-of-beam.
+ };
+
+ // Passed to beam_search_callback function.
+ // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+ // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+ // These pointers are valid only during the synchronous callback, so should not be saved.
+ struct llama_beams_state {
+ struct llama_beam_view * beam_views;
+ size_t n_beams; // Number of elements in beam_views[].
+ size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+ bool last_call; // True iff this is the last callback invocation.
+ };
+
+ // Type of pointer to the beam_search_callback function.
+ // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+ // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+ typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+ /// @details Deterministically returns entire sentence constructed by a beam search.
+ /// @param ctx Pointer to the llama_context.
+ /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+ /// @param callback_data A pointer that is simply passed back to callback.
+ /// @param n_beams Number of beams to use.
+ /// @param n_past Number of tokens already evaluated.
+ /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+ /// @param n_threads Number of threads as passed to llama_eval().
+ LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
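A hedged sketch (not part of the diff) of a beam-search callback built from the declarations above: it marks a beam as ended when its newest token is EOS. The llama_token_eos(ctx) helper and the surrounding setup (ctx, n_past, thread count) are assumptions:

static void beam_search_cb(void * callback_data, struct llama_beams_state state) {
    struct llama_context * ctx = (struct llama_context *) callback_data;
    for (size_t i = 0; i < state.n_beams; ++i) {
        struct llama_beam_view * bv = &state.beam_views[i];
        // set end-of-beam when the last token of this beam is EOS
        if (!bv->eob && bv->n_tokens > 0 && bv->tokens[bv->n_tokens - 1] == llama_token_eos(ctx)) {
            bv->eob = true;
        }
    }
}

// e.g.: llama_beam_search(ctx, beam_search_cb, ctx, /*n_beams*/ 4, n_past, /*n_predict*/ 64, /*n_threads*/ 4);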
@@ -477,6 +524,8 @@ extern "C" {
  // If this is not called, or NULL is supplied, everything is output on stderr.
  LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);

+ LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
  #ifdef __cplusplus
  }
  #endif
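The new #include <stdio.h> at the top of the header presumably supports the FILE * parameter of llama_dump_timing_info_yaml. A one-line usage sketch (not part of the diff), assuming an initialized llama_context * ctx:

llama_dump_timing_info_yaml(stderr, ctx);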
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.4.0'
+ VERSION = '0.5.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1060'
+ LLAMA_CPP_VERSION = 'b1198'
  end
data/lib/llama_cpp.rb CHANGED
@@ -99,7 +99,7 @@ module LLaMACpp
  end
  end

- embd.each { |token| output << context.token_to_str(token) }
+ embd.each { |token| output << context.token_to_piece(token) }

  break if !embd.empty? && embd[-1] == context.token_eos
  end
data/sig/llama_cpp.rbs CHANGED
@@ -76,9 +76,11 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def n_ctx: () -> Integer
  def n_embd: () -> Integer
- def token_to_str: (Integer) -> String
+ def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
- def type: () -> String
+ def desc: () -> String
+ def size: () -> Integer
+ def n_params: () -> Integer
  end

  class Timings
@@ -116,7 +118,7 @@ module LLaMACpp
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
- def token_to_str: (Integer) -> String
+ def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  def kv_cache_token_count: () -> Integer
  def set_rng_seed: (Integer) -> void
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.4.0
+ version: 0.5.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-08-26 00:00:00.000000000 Z
+ date: 2023-09-08 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: