RubyGems - llama_cpp - Versions diffs - 0.10.3 → 0.10.4 - Mend

llama_cpp 0.10.3 → 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/ext/llama_cpp/src/ggml-backend.c +6 -2
data/ext/llama_cpp/src/ggml-cuda.cu +73 -63
data/ext/llama_cpp/src/ggml-impl.h +1 -0
data/ext/llama_cpp/src/ggml-metal.m +43 -20
data/ext/llama_cpp/src/ggml-metal.metal +464 -245
data/ext/llama_cpp/src/ggml-opencl.h +9 -9
data/ext/llama_cpp/src/ggml-quants.c +61 -57
data/ext/llama_cpp/src/ggml.c +171 -5
data/ext/llama_cpp/src/ggml.h +1 -0
data/ext/llama_cpp/src/llama.cpp +222 -105
data/ext/llama_cpp/src/llama.h +31 -32
data/lib/llama_cpp/version.rb +2 -2
metadata +3 -3

data/ext/llama_cpp/src/llama.h CHANGED

@@ -226,7 +226,7 @@ extern "C" {
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
@@ -310,21 +310,20 @@ extern "C" {
     LLAMA_API int64_t llama_time_us(void);
-    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API int32_t  llama_max_devices(void);
     LLAMA_API bool llama_mmap_supported (void);
     LLAMA_API bool llama_mlock_supported(void);
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
-    // TODO: become more consistent with returned int types across the API
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
-    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -335,19 +334,19 @@ extern "C" {
     // - GGUF array values are not supported by these functions
     // Get metadata value as a string by key name
-    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
     // Get the number of metadata key/value pairs
-    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
     // Get metadata key name by index
-    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
     // Get metadata value as a string by index
-    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
     // Get a string describing the model type
-    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
@@ -359,7 +358,7 @@ extern "C" {
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
     // Returns 0 on success
-    LLAMA_API int llama_model_quantize(
+    LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
             const llama_model_quantize_params * params);
@@ -370,20 +369,20 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
             struct llama_context * ctx,
                       const char * path_lora,
                            float   scale,
                       const char * path_base_model,
-                             int   n_threads),
+                         int32_t   n_threads),
             "use llama_model_apply_lora_from_file instead");
-    LLAMA_API int llama_model_apply_lora_from_file(
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
                       const char * path_lora,
                            float   scale,
                       const char * path_base_model,
-                             int   n_threads);
+                         int32_t   n_threads);
     //
     // KV cache
@@ -439,10 +438,10 @@ extern "C" {
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
     // Clear the KV cache
     LLAMA_API void llama_kv_cache_clear(
@@ -533,7 +532,7 @@ extern "C" {
             struct llama_context * ctx,
                      llama_token * tokens,
                          int32_t   n_tokens,
-                             int   n_past),
+                         int32_t   n_past),
             "use llama_decode() instead");
     // Same as llama_eval, but use float matrix input directly.
@@ -542,7 +541,7 @@ extern "C" {
             struct llama_context * ctx,
                            float * embd,
                          int32_t   n_tokens,
-                             int   n_past),
+                         int32_t   n_past),
             "use llama_decode() instead");
     // Return batch for single sequence of tokens starting at pos_0
@@ -574,7 +573,7 @@ extern "C" {
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
     // < 0 - error
-    LLAMA_API int llama_decode(
+    LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
               struct llama_batch   batch);
@@ -614,10 +613,10 @@ extern "C" {
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
     // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int         llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
     // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int         llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
     // codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@@ -635,12 +634,12 @@ extern "C" {
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
-    LLAMA_API int llama_tokenize(
+    LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
-                             int   text_len,
+                         int32_t   text_len,
                      llama_token * tokens,
-                             int   n_max_tokens,
+                         int32_t   n_max_tokens,
                             bool   add_bos,
                             bool   special);
@@ -648,11 +647,11 @@ extern "C" {
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    LLAMA_API int llama_token_to_piece(
+    LLAMA_API int32_t llama_token_to_piece(
               const struct llama_model * model,
                            llama_token   token,
                                   char * buf,
-                                  int    length);
+                               int32_t   length);
     //
     // Grammar
@@ -704,7 +703,7 @@ extern "C" {
     LLAMA_API void llama_sample_top_k(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
-                             int   k,
+                         int32_t   k,
                           size_t   min_keep);
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -763,7 +762,7 @@ extern "C" {
           llama_token_data_array * candidates,
                            float   tau,
                            float   eta,
-                             int   m,
+                         int32_t   m,
                            float * mu);
     /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -836,8 +835,8 @@ extern "C" {
         llama_beam_search_callback_fn_t   callback,
                                    void * callback_data,
                                  size_t   n_beams,
-                                    int   n_past,
-                                    int   n_predict);
+                                int32_t   n_past,
+                                int32_t   n_predict);
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

data/lib/llama_cpp/version.rb CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.10.3'
+  VERSION = '0.10.4'
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1710'
+  LLAMA_CPP_VERSION = 'b1768'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.10.3
+  version: 0.10.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-12-30 00:00:00.000000000 Z
+date: 2024-01-06 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.22
+rubygems_version: 3.5.3
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.