llama_cpp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,13 @@
  #ifndef LLAMA_H
  #define LLAMA_H

+ #include "ggml.h"
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+ #else
+ #define LLAMA_MAX_DEVICES 1
+ #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
  #include <stdbool.h>
@@ -31,7 +38,7 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif
@@ -65,9 +72,12 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- int n_ctx; // text context
- int n_gpu_layers; // number of layers to store in VRAM
- int seed; // RNG seed, -1 for random
+ int n_ctx; // text context
+ int n_batch; // prompt processing batch size
+ int n_gpu_layers; // number of layers to store in VRAM
+ int main_gpu; // the GPU that is used for scratch and small tensors
+ float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -94,9 +104,27 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
  };

+ // model quantization parameters
+ typedef struct llama_model_quantize_params {
+ int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ } llama_model_quantize_params;
+
  LLAMA_API struct llama_context_params llama_context_default_params();
+ LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();
@@ -118,14 +146,11 @@ extern "C" {
  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);

- // TODO: not great API - very likely to change
  // Returns 0 on success
- // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype,
- int nthread);
+ const llama_model_quantize_params * params);

  // Apply a LoRA adapter to a loaded model
  // path_base_model is the path to a higher quality model to use as a base for
@@ -173,6 +198,12 @@ extern "C" {
  int n_past,
  int n_threads);

+ // Export a static computation graph for context of 511 and batch size of 1
+ // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+ // parameters here to keep things simple
+ // IMPORTANT: do not use for anything else other than debugging and testing!
+ LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
  // Convert the provided text into tokens.
  // The tokens pointer must be large enough to hold the resulting tokens.
  // Returns the number of tokens on success, no more than n_max_tokens
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.4'
+ VERSION = '0.2.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-ffb06a3'
+ LLAMA_CPP_VERSION = 'master-4de0334'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -5,6 +5,8 @@ module LLaMACpp
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String

+ LLAMA_MAX_DEVICES: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -13,9 +15,18 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+ LLAMA_FTYPE_MOSTLY_Q2_K: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q3_K_L: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
+ LLAMA_FTYPE_MOSTLY_Q6_K: Integer

  def self?.init_backend: () -> void
- def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+ def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
@@ -52,6 +63,7 @@ module LLaMACpp
  def embeddings: () -> Array[Float]
  def empty?: () -> bool
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_export: (String) -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def logits: () -> Array[Float]
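
The new Context#eval_export wraps llama_eval_export from the bundled llama.h, which writes a static computation graph for a context of 511 and batch size 1 and is intended only for debugging and testing. A minimal sketch follows; the model path and output filename are placeholders, and the no-argument ContextParams.new / Context.new constructors are assumptions (this diff only shows the empty?/load signatures):

    require 'llama_cpp'

    LLaMACpp.init_backend

    context = LLaMACpp::Context.new   # assumed: creates an empty context
    context.load(model_path: 'path/to/ggml-model-q4_0.bin',
                 params: LLaMACpp::ContextParams.new)   # assumed no-arg constructor

    # Debugging/testing only, per the llama.h comment: exports a static
    # computation graph (context 511, batch size 1) to the given file.
    context.eval_export('llama.ggml')   # => true on success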
@@ -92,6 +104,13 @@ module LLaMACpp
  def logits_all=: (bool) -> bool
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
+ def n_batch: () -> Integer
+ def n_batch=: (Integer) -> Integer
+ def n_gpu_layers: () -> Integer
+ def n_gpu_layers=: (Integer) -> Integer
+ def main_gpu: () -> Integer
+ def main_gpu=: (Integer) -> Integer
+ def tensor_split: () -> Array[Float]
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
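
These accessors expose the new GPU-related fields of llama_context_params (see the llama.h hunk above) through ContextParams; note that tensor_split is only declared as a reader here. A minimal sketch of configuring them is below; the layer count, device index, and model path are illustrative values, and the no-argument ContextParams.new / Context.new constructors are assumptions not shown in this diff:

    require 'llama_cpp'

    LLaMACpp.init_backend

    params = LLaMACpp::ContextParams.new   # assumed no-arg constructor
    params.n_ctx        = 2048             # text context
    params.n_batch      = 512              # prompt processing batch size
    params.n_gpu_layers = 32               # layers to keep in VRAM (illustrative)
    params.main_gpu     = 0                # GPU used for scratch and small tensors
    params.seed         = 42

    p params.tensor_split   # => Array[Float], one entry per device (reader only)

    context = LLaMACpp::Context.new        # assumed: creates an empty context
    context.load(model_path: 'path/to/ggml-model-q4_k_m.bin', params: params)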
@@ -102,6 +121,19 @@ module LLaMACpp
  def vocab_only=: (bool) -> bool
  end

+ class ModelQuantizeParams
+ public
+
+ def n_thread: () -> Integer
+ def n_thread=: (Integer) -> Integer
+ def ftype: () -> Integer
+ def ftype=: (Integer) -> Integer
+ def allow_quantization: () -> bool
+ def allow_quantization=: (bool) -> bool
+ def quantize_output_tensor: () -> bool
+ def quantize_output_tensor=: (bool) -> bool
+ end
+
  class Params = ContextParams

  class Client
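
ModelQuantizeParams carries the same options as llama_model_quantize_params in llama.h; the C field nthread is surfaced here as n_thread, and allow_requantize as allow_quantization. A sketch of quantizing a model to one of the new k-quant formats via the changed model_quantize signature follows; the file paths are placeholders and ModelQuantizeParams.new is assumed to take no arguments:

    require 'llama_cpp'

    qparams = LLaMACpp::ModelQuantizeParams.new   # assumed no-arg constructor
    qparams.n_thread = 4                          # <= 0 falls back to hardware_concurrency()
    qparams.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M   # one of the new k-quant types
    qparams.quantize_output_tensor = true         # also quantize output.weight
    qparams.allow_quantization = false            # only quantize f32/f16 tensors
                                                  # (maps to allow_requantize in llama.h)

    LLaMACpp.model_quantize(
      input_path:  'path/to/ggml-model-f16.bin',
      output_path: 'path/to/ggml-model-q4_k_m.bin',
      params:      qparams
    )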
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 0.2.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-03 00:00:00.000000000 Z
+ date: 2023-06-11 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -26,11 +26,17 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-metal.h
+ - ext/llama_cpp/src/ggml-metal.m
+ - ext/llama_cpp/src/ggml-metal.metal
  - ext/llama_cpp/src/ggml-opencl.cpp
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/k_quants.c
+ - ext/llama_cpp/src/k_quants.h
  - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h