llama_cpp 0.0.4 → 0.0.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +26 -0
- data/ext/llama_cpp/llama_cpp.cpp +106 -0
- data/ext/llama_cpp/src/ggml-cuda.h +12 -0
- data/ext/llama_cpp/src/ggml.c +2038 -895
- data/ext/llama_cpp/src/ggml.h +21 -1
- data/ext/llama_cpp/src/llama.cpp +376 -62
- data/ext/llama_cpp/src/llama.h +17 -1
- data/ext/llama_cpp/src/llama_util.h +22 -16
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +13 -1
- metadata +3 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -72,6 +72,8 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+    LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
 };

 LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,24 @@ extern "C" {

     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
+
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);

     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
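Both new entry points are surfaced by the Ruby bindings in this release (see the sig/llama_cpp.rbs changes further down). A minimal sketch of applying a LoRA adapter from Ruby, assuming the keyword signature from the RBS and placeholder file paths:

    require 'llama_cpp'

    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin',
                                    params: LLaMACpp::ContextParams.new)

    # Mirrors llama_apply_lora_from_file: base_model_path is optional and, when
    # omitted, the adapter is applied on top of the currently loaded model.
    context.apply_lora_from_file(lora_path: 'lora/ggml-adapter-model.bin', n_threads: 4)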
data/ext/llama_cpp/src/llama_util.h
CHANGED
@@ -43,8 +43,12 @@
 } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-}
+}

 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }

-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
         }
     }

@@ -190,14 +195,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;

         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);

         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -212,13 +216,15 @@ struct llama_mmap {
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.4'
+  VERSION = '0.0.6'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-12b5900'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+    spaced_prompt = " #{prompt}"

-    embd_input = context.tokenize(text:
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end

-    output.join.delete_prefix(
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
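With this change the leading space is handled inside the helper (spaced_prompt) and stripped from the result, so callers pass the prompt unchanged. A usage sketch, assuming a placeholder model path:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin', params: params)

    # The space prefix and its removal are internal; the returned text is the
    # generated continuation of the raw prompt.
    puts LLaMACpp.generate(context, 'Hello, my name is', n_threads: 4)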
data/sig/llama_cpp.rbs
CHANGED
@@ -5,10 +5,21 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String

+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool

   class Context
     public
@@ -16,7 +27,7 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) ->
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -28,6 +39,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end

   class ContextParams
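The new constants and module-level helpers above can be combined to quantize a model to the Q4_2 format added in this release. A rough sketch, assuming placeholder paths; per the llama.h comment, passing n_threads <= 0 would fall back to the hardware concurrency:

    require 'llama_cpp'

    # Feature checks exposed in this release.
    puts "mmap supported: #{LLaMACpp.mmap_supported?}"
    puts "mlock supported: #{LLaMACpp.mlock_supported?}"

    # Quantize an F16 model to the new Q4_2 format using 4 threads.
    LLaMACpp.model_quantize(input_path: 'models/7B/ggml-model-f16.bin',
                            output_path: 'models/7B/ggml-model-q4_2.bin',
                            ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_2,
                            n_threads: 4)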
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp