llama_cpp 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
 
    
data/ext/llama_cpp/src/llama.h
CHANGED

@@ -34,29 +34,18 @@
 #    define DEPRECATED(func, hint) func
 #endif
 
-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -72,6 +61,52 @@ extern "C" {
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token

@@ -86,25 +121,10 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
-
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
-
     struct llama_context_params {
         uint32_t seed;         // RNG seed, -1 for random
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t  n_gpu_layers; // number of layers to store in VRAM
         int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 

@@ -129,33 +149,18 @@ extern "C" {
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
     };
-
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
-    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype   ftype;    // quantize to this llama_ftype
+        enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;

@@ -208,27 +213,16 @@ extern "C" {
         int32_t n_eval;
     };
 
-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-    LLAMA_API int llama_max_devices();
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
 
-    LLAMA_API int64_t llama_time_us();
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,

@@ -240,17 +234,28 @@ extern "C" {
                     struct llama_model * model,
             struct llama_context_params   params);
 
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
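The hunk above removes the deprecated `llama_init_from_file` path and adds model-level accessors (`llama_model_n_vocab`, `llama_model_type`, and friends) that work on a bare `llama_model`. A minimal sketch of querying a model through them; the model path and buffer size are placeholders, and the interpretation of `llama_model_type`'s int return value as a character count is an assumption:

#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init(false); // no NUMA optimizations

    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        return 1;
    }

    // "Get a string describing the model type" per the header comment.
    char desc[128];
    llama_model_type(model, desc, sizeof(desc));
    printf("model: %s\n", desc);
    printf("n_vocab=%d n_ctx=%d n_embd=%d\n",
           llama_model_n_vocab(model),
           llama_model_n_ctx(model),
           llama_model_n_embd(model));

    llama_free_model(model); // counterpart of llama_load_model_from_file
    llama_backend_free();
    return 0;
}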
@@ -272,9 +277,9 @@ extern "C" {
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                         const char * path_lora,
-                         const char * path_base_model,
-                                int   n_threads);
+                             const char * path_lora,
+                             const char * path_base_model,
+                                    int   n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

@@ -324,11 +329,40 @@ extern "C" {
     // IMPORTANT: do not use for anything else other than debugging and testing!
     LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
                      const char * text,

@@ -343,55 +377,24 @@ extern "C" {
                            int   n_max_tokens,
                           bool   add_bos);
 
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    LLAMA_API int llama_get_vocab_from_model(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    // Does not write null terminator to the buffer
+    LLAMA_API int llama_token_to_str(
             const struct llama_context * ctx,
-                         llama_token   token);
+                         llama_token   token,
+                                char * buf,
+                                int    length);
 
-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_str_with_model(
               const struct llama_model * model,
-                         llama_token   token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl();   // next-line
+                         llama_token   token,
+                                char * buf,
+                                int    length);
 
+    //
     // Grammar
     //
+
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
                                  size_t    n_rules,
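Note the breaking change above: `llama_token_to_str` now returns an int and writes into a caller-supplied buffer, and per the new header comment it does not write a null terminator. A minimal sketch of a wrapper that restores a C-string contract; the negative-return convention for an undersized buffer is an assumption, mirroring the `llama_tokenize` documentation above:

#include "llama.h"

// Hypothetical helper (not part of the gem): copy a token's text into `out`
// and NUL-terminate it, since llama_token_to_str() no longer does.
static int token_to_cstr(struct llama_context * ctx, llama_token token,
                         char * out, int out_size) {
    const int n = llama_token_to_str(ctx, token, out, out_size - 1);
    if (n < 0) {
        return n; // assumed: buffer too small or error; `out` is not usable
    }
    out[n] = '\0';
    return n;
}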
@@ -399,7 +402,9 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);

@@ -468,6 +473,10 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif

@@ -477,10 +486,11 @@
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
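The `llama_log_set` declaration moves here from earlier in the header (old lines 211-213) and pairs with the `enum llama_log_level` and `llama_log_callback` definitions added above. A minimal sketch of installing a filtering callback; the severity policy is illustrative:

#include <stdio.h>
#include "llama.h"

// Forward only warnings and errors. Per the header's comment, `text`
// usually already ends with '\n', so no extra newline is added here.
static void quiet_logger(enum llama_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level <= LLAMA_LOG_LEVEL_WARN) { // ERROR = 2, WARN = 3, INFO = 4
        fputs(text, stderr);
    }
}

int main(void) {
    llama_log_set(quiet_logger, NULL);
    llama_backend_init(false);
    // ... load a model, create a context, evaluate ...
    llama_backend_free();
    return 0;
}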
    
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.7'
+  VERSION = '0.4.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '…'
+  LLAMA_CPP_VERSION = 'b1060'
 end
data/lib/llama_cpp.rb
CHANGED

@@ -101,7 +101,7 @@ module LLaMACpp
 
       embd.each { |token| output << context.token_to_str(token) }
 
-      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == context.token_eos
     end
 
     output.join.delete_prefix(spaced_prompt).strip
data/sig/llama_cpp.rbs
CHANGED

@@ -1,9 +1,6 @@
 module LLaMACpp
   VERSION: String
   LLAMA_CPP_VERSION: String
-  LLAMA_FILE_VERSION: String
-  LLAMA_FILE_MAGIC: String
-  LLAMA_FILE_MAGIC_UNVERSIONED: String
   LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer

@@ -42,9 +39,7 @@ module LLaMACpp
     ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
     ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
-  def self?.token_bos: () -> Integer
-  def self?.token_eos: () -> Integer
-  def self?.token_nl: () -> Integer
+  def self?.time_us: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer

@@ -81,9 +76,9 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
-    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def type: () -> String
   end
 
   class Timings

@@ -105,6 +100,12 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool

@@ -112,7 +113,6 @@ module LLaMACpp
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
-    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: llama_cpp
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.4.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - yoshoku
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2023-08- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2023-08-26 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       13 
13 
     | 
    
         
             
            description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
         
     | 
| 
       14 
14 
     | 
    
         
             
            email:
         
     |