llama_cpp 0.1.3 → 0.2.0

ext/llama_cpp/src/llama.h CHANGED
@@ -1,6 +1,13 @@
  #ifndef LLAMA_H
  #define LLAMA_H

+ #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+ #else
+ #define LLAMA_MAX_DEVICES 1
+ #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
  #include <stdbool.h>
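The LLAMA_MAX_DEVICES constant introduced here is also exported to Ruby as LLaMACpp::LLAMA_MAX_DEVICES (see the RBS additions below). A minimal sketch of how a caller might check it; the messages are purely illustrative:

require 'llama_cpp'

# LLAMA_MAX_DEVICES is 1 unless llama.cpp was built with cuBLAS,
# in which case it equals GGML_CUDA_MAX_DEVICES.
if LLaMACpp::LLAMA_MAX_DEVICES > 1
  puts "multi-GPU tensor splitting available (up to #{LLaMACpp::LLAMA_MAX_DEVICES} devices)"
else
  puts 'single-device build'
end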
@@ -31,6 +38,11 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+ // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+ #define LLAMA_SUPPORTS_GPU_OFFLOAD
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -60,9 +72,12 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- int n_ctx; // text context
- int n_gpu_layers; // number of layers to store in VRAM
- int seed; // RNG seed, -1 for random
+ int n_ctx; // text context
+ int n_batch; // prompt processing batch size
+ int n_gpu_layers; // number of layers to store in VRAM
+ int main_gpu; // the GPU that is used for scratch and small tensors
+ float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
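The Ruby ContextParams class gains matching accessors (see the RBS changes further down). A hedged sketch of configuring the new fields before loading a model; creating a Context with no arguments and the model path are assumptions made for illustration:

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ctx        = 512   # text context
params.n_batch      = 512   # prompt processing batch size
params.n_gpu_layers = 32    # number of layers to store in VRAM
params.main_gpu     = 0     # GPU used for scratch and small tensors
params.seed         = 42

# tensor_split is exposed as a reader with LLAMA_MAX_DEVICES entries.
p params.tensor_split

# Assumed usage: an empty context is created and the model loaded via #load,
# whose keyword signature appears in the RBS below.
context = LLaMACpp::Context.new
context.load(model_path: 'path/to/model.bin', params: params)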
@@ -89,9 +104,27 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
  };

+ // model quantization parameters
+ typedef struct llama_model_quantize_params {
+ int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ } llama_model_quantize_params;
+
  LLAMA_API struct llama_context_params llama_context_default_params();
+ LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();
@@ -113,14 +146,11 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

- // TODO: not great API - very likely to change
  // Returns 0 on success
- // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype,
- int nthread);
+ const llama_model_quantize_params * params);

  // Apply a LoRA adapter to a loaded model
  // path_base_model is the path to a higher quality model to use as a base for
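In the Ruby bindings, model_quantize now takes a params object in place of the old ftype/n_threads keywords (see the RBS change below). A rough sketch, assuming ModelQuantizeParams.new takes no arguments; the file paths are placeholders:

require 'llama_cpp'

qparams = LLaMACpp::ModelQuantizeParams.new
qparams.n_thread = 4                                    # <= 0 falls back to std::thread::hardware_concurrency()
qparams.ftype    = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M  # one of the new k-quant formats
qparams.quantize_output_tensor = true                   # also quantize output.weight

LLaMACpp.model_quantize(
  input_path:  'models/ggml-model-f16.bin',     # placeholder path
  output_path: 'models/ggml-model-q4_k_m.bin',  # placeholder path
  params:      qparams
)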
@@ -168,6 +198,12 @@ extern "C" {
  int n_past,
  int n_threads);

+ // Export a static computation graph for context of 511 and batch size of 1
+ // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+ // parameters here to keep things simple
+ // IMPORTANT: do not use for anything else other than debugging and testing!
+ LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
  // Convert the provided text into tokens.
  // The tokens pointer must be large enough to hold the resulting tokens.
  // Returns the number of tokens on success, no more than n_max_tokens
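The Ruby Context class wraps this as eval_export (see the RBS below). A minimal sketch; context is assumed to be an already-loaded LLaMACpp::Context and the output filename is a placeholder:

# Writes the static computation graph (context 511, batch size 1) to a file.
# As the header warns, this is for debugging and demonstration only.
context.eval_export('llama.ggml')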
lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.3'
+ VERSION = '0.2.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-66874d4'
+ LLAMA_CPP_VERSION = 'master-4de0334'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String

+ LLAMA_MAX_DEVICES: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+ LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q6_K: Integer

  def self?.init_backend: () -> void
- def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+ def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
@@ -52,6 +63,7 @@ module LLaMACpp
  def embeddings: () -> Array[Float]
  def empty?: () -> bool
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_export: (String) -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def logits: () -> Array[Float]
@@ -92,6 +104,13 @@ module LLaMACpp
  def logits_all=: (bool) -> bool
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
+ def n_batch: () -> Integer
+ def n_batch=: (Integer) -> Integer
+ def n_gpu_layers: () -> Integer
+ def n_gpu_layers=: (Integer) -> Integer
+ def main_gpu: () -> Integer
+ def main_gpu=: (Integer) -> Integer
+ def tensor_split: () -> Array[Float]
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
@@ -102,6 +121,19 @@ module LLaMACpp
  def vocab_only=: (bool) -> bool
  end

+ class ModelQuantizeParams
+ public
+
+ def n_thread: () -> Integer
+ def n_thread=: (Integer) -> Integer
+ def ftype: () -> Integer
+ def ftype=: (Integer) -> Integer
+ def allow_quantization: () -> bool
+ def allow_quantization=: (bool) -> bool
+ def quantize_output_tensor: () -> bool
+ def quantize_output_tensor=: (bool) -> bool
+ end
+
  class Params = ContextParams

  class Client
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.3
+ version: 0.2.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-05-26 00:00:00.000000000 Z
+ date: 2023-06-11 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -26,11 +26,17 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-metal.h
+ - ext/llama_cpp/src/ggml-metal.m
+ - ext/llama_cpp/src/ggml-metal.metal
  - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/k_quants.c
+ - ext/llama_cpp/src/k_quants.h
  - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h