llama_cpp 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/src/llama.h
CHANGED
```diff
@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H
 
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -31,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -65,9 +72,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx;
-        int n_gpu_layers;
-        int seed;
+        int n_ctx;                             // text context
+        int n_batch;                           // prompt processing batch size
+        int n_gpu_layers;                      // number of layers to store in VRAM
+        int main_gpu;                          // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
+        int seed;                              // RNG seed, -1 for random
 
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -94,9 +105,27 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,    // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10,   // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
@@ -118,14 +147,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype,
-            int nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +199,12 @@ extern "C" {
             int n_past,
             int n_threads);
 
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
@@ -189,6 +221,14 @@ extern "C" {
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+    LLAMA_API int llama_get_vocab(
+            const struct llama_context * ctx,
+            const char * * strings,
+                    float * scores,
+                      int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -204,9 +244,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line
 
     // Sampling functions
 
```
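The headline API change in llama.h is that `llama_model_quantize` now takes a `llama_model_quantize_params` struct (with a matching `llama_model_quantize_default_params()`) instead of positional ftype/thread arguments, and the new k-quant file types become selectable. The gem surfaces this as `LLaMACpp.model_quantize` with a `ModelQuantizeParams` object (see the `sig/llama_cpp.rbs` changes further down). A minimal Ruby sketch of the new call; the file names are placeholders, and a no-argument `ModelQuantizeParams.new` returning the defaults is assumed:

```ruby
require 'llama_cpp'

# Quantize an f16 GGML model to one of the new k-quant formats.
# File names are placeholders; ModelQuantizeParams.new is assumed to return
# the defaults (the Ruby-side counterpart of llama_model_quantize_default_params()).
params = LLaMACpp::ModelQuantizeParams.new
params.n_thread = 4                                 # <= 0 falls back to hardware concurrency
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # new k-quant file type (= 15)
params.quantize_output_tensor = true                # also quantize output.weight

LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                        output_path: 'ggml-model-q4_k_m.bin',
                        params: params)
```

The removed positional form (input file, output file, ftype, thread count) is gone from the header, so existing callers have to switch to the params object.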
data/lib/llama_cpp/version.rb
CHANGED
```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.4'
+  VERSION = '0.2.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-a09f919'
 end
```
data/sig/llama_cpp.rbs
CHANGED
```diff
@@ -5,6 +5,8 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_MAX_DEVICES: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
   def self?.init_backend: () -> void
-  def self?.model_quantize: (input_path: String, output_path: String,
+  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
@@ -52,12 +63,14 @@ module LLaMACpp
     def embeddings: () -> Array[Float]
     def empty?: () -> bool
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_export: (String) -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
@@ -92,6 +105,15 @@ module LLaMACpp
     def logits_all=: (bool) -> bool
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
+    def n_batch: () -> Integer
+    def n_batch=: (Integer) -> Integer
+    def n_gpu_layers: () -> Integer
+    def n_gpu_layers=: (Integer) -> Integer
+    def main_gpu: () -> Integer
+    def main_gpu=: (Integer) -> Integer
+    def tensor_split: () -> Array[Float]
+    def low_vram: () -> bool
+    def low_vram=: (bool) -> bool
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
@@ -102,6 +124,19 @@ module LLaMACpp
     def vocab_only=: (bool) -> bool
   end
 
+  class ModelQuantizeParams
+    public
+
+    def n_thread: () -> Integer
+    def n_thread=: (Integer) -> Integer
+    def ftype: () -> Integer
+    def ftype=: (Integer) -> Integer
+    def allow_quantization: () -> bool
+    def allow_quantization=: (bool) -> bool
+    def quantize_output_tensor: () -> bool
+    def quantize_output_tensor=: (bool) -> bool
+  end
+
   class Params = ContextParams
 
   class Client
```
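The new `ContextParams` accessors above surface llama.cpp's batching and GPU-offload knobs (`n_batch`, `n_gpu_layers`, `main_gpu`, `tensor_split`, `low_vram`) in Ruby. A short sketch under two assumptions: the extension was built with a GPU backend (cuBLAS, CLBlast, or Metal), and a context is constructed with `Context.new(model_path:, params:)` as in the gem's README; the model path is a placeholder:

```ruby
require 'llama_cpp'

# Offload part of the model to the GPU via the new ContextParams fields.
params = LLaMACpp::ContextParams.new
params.n_ctx = 512
params.n_batch = 512      # prompt-processing batch size
params.n_gpu_layers = 32  # number of layers to keep in VRAM
params.main_gpu = 0       # GPU used for scratch and small tensors
params.low_vram = false   # set true to trade speed for lower VRAM use
params.seed = 42

# Context.new(model_path:, params:) as in the gem's README is assumed here;
# the model path is a placeholder.
context = LLaMACpp::Context.new(model_path: 'ggml-model-q4_0.bin', params: params)
puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 32)
```

With `n_gpu_layers` left at 0, evaluation stays entirely on the CPU, so the offload fields are opt-in.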
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.2.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-
+date: 2023-06-17 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -22,15 +22,23 @@ files:
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
+- examples/README.md
+- examples/chat.rb
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-metal.h
+- ext/llama_cpp/src/ggml-metal.m
+- ext/llama_cpp/src/ggml-metal.metal
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
+- ext/llama_cpp/src/k_quants.c
+- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
```