llama_cpp 0.5.2 → 0.6.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9e38c82f6ce7404a78b3ecdbc9574ae860322e6945499f0c4a905956bcbd2be7
- data.tar.gz: 4a5effb6fcf3182baad091717bc510176eb127ccd660342ce0cc46bf2d392b4a
+ metadata.gz: 854493444a65cd1239649b991c8e6538c542c02a052932f6a69c56c984e28f58
+ data.tar.gz: 4e0b70de25eb2661b693af0d488efd25f570c3f62d4b9044fdd5c14fb5b9fac6
  SHA512:
- metadata.gz: c471bd6c6afee142945d03da1c4908355fe900a5f0c259583b7b65f97d495d07c5397d1b551da888a5970170944596959ddef73d2df803acf001b8d079d0affb
- data.tar.gz: 99cbb2d978723f9814d8ac7163f03c642a1ac6cabbd6cf09d003f563c629563a920d909ab797729f1e233f30d5776bf9f70f4c473919e5bf101d3e3f5fd6e938
+ metadata.gz: b2524b8eb6e8568116f3c33eb57b764044083ee2ff2bbb7f15fc6301b024197ea8fca75968535b302a9e70449c9f9f28e0760cf4bfefb00ed8137c18e84137d5
+ data.tar.gz: faf26b552a8a862a97129b5bd25e05b3ae3edd2f8b118622b119634e4b004c05d200653c40085e4a28243c8994c517699baa35d3a8096ad8ac598fd637cf0565
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
+ ## [[0.6.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.3...v0.6.0)] - 2023-09-30
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from b1266 to b1292.
+ - There are many API changes, so please refer to the commits.
+
+ It is becoming difficult to keep up with major changes in llama.cpp,
+ and I may give up on developing this gem in the future to prioritize my own life.
+
+ ## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+ - Bump bundled llama.cpp from b1 to b1266.
+
  ## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16

  - Bump bundled llama.cpp from b1198 to b1.
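The changelog only points at the upstream commits for details. As a rough illustration (not part of the gem's files), the README and example diffs below boil down to the following migration; the model path is a placeholder:

```ruby
require 'llama_cpp'

# 0.5.x style (removed): a single ContextParams carried both model and
# context settings, and generate took an n_threads keyword.
#
#   params = LLaMACpp::ContextParams.new
#   params.seed = 42
#   model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)
#   context = LLaMACpp::Context.new(model: model)
#   puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)

# 0.6.0 style: model-level and context-level settings are split into
# ModelParams and ContextParams, and Context.new takes its own params.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: model_params)

context_params = LLaMACpp::ContextParams.new
context_params.seed = 42
context = LLaMACpp::Context.new(model: model, params: context_params)

puts LLaMACpp.generate(context, 'Hello, World.')
```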
data/README.md CHANGED
@@ -59,13 +59,14 @@ An example of Ruby code that generates sentences with the quantization model is
  ```ruby
  require 'llama_cpp'

- params = LLaMACpp::ContextParams.new
- params.seed = 42
+ model_params = LLaMACpp::ModelParams.new
+ model = LLaMACpp::Model.new(model_path: '/home/user/llama.cpp/models/open_llama_7b/ggml-model-q4_0.bin', params: model_params)

- model = LLaMACpp::Model.new(model_path: '/home/user/llama.cpp/models/open_llama_7b/ggml-model-q4_0.bin', params: params)
- context = LLaMACpp::Context.new(model: model)
+ context_params = LLaMACpp::ContextParams.new
+ context_params.seed = 42
+ context = LLaMACpp::Context.new(model: model, params: context_params)

- puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
+ puts LLaMACpp.generate(context, 'Hello, World.')

  ```

  ## Examples
data/examples/chat.rb CHANGED
@@ -14,7 +14,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  default_command :main
  desc 'main', 'Start chat'
  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
- option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
  option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
  option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
@@ -32,16 +31,17 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  option :temp, type: :numeric, desc: 'temperature', default: 0.8
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
- params = LLaMACpp::ContextParams.new
- params.seed = options[:seed] if options[:seed] != -1
- params.n_gpu_layers = options[:n_gpu_layers]
- model = LLaMACpp::Model.new(model_path: options[:model], params: params)
- context = LLaMACpp::Context.new(model: model)
+ mdl_params = LLaMACpp::ModelParams.new
+ mdl_params.n_gpu_layers = options[:n_gpu_layers]
+ model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
+ ctx_params = LLaMACpp::ContextParams.new
+ ctx_params.seed = options[:seed] if options[:seed] != -1
+ context = LLaMACpp::Context.new(model: model, params: ctx_params)

  antiprompt = options[:reverse_prompt] || 'User:'
  start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)

- embd_input = context.tokenize(text: start_prompt, add_bos: true)
+ embd_input = context.model.tokenize(text: start_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
@@ -58,7 +58,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  n_consumed = 0
  n_past = 0
  n_remain = options[:n_predict]
- n_vocab = context.n_vocab
+ n_vocab = context.model.n_vocab

  while interactive
  unless embd.empty?
@@ -70,7 +70,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation

  0.step(embd.size - 1, options[:batch_size]) do |i|
  n_eval = [options[:batch_size], embd.size - i].min
- context.eval(tokens: embd[i...i + n_eval], n_past: n_past, n_threads: options[:threads])
+ context.eval(tokens: embd[i...i + n_eval], n_past: n_past)
  n_past += n_eval
  end
  end
@@ -102,7 +102,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  if id == context.token_eos
  id = context.token_nl
  unless antiprompt.empty?
- first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
+ first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
  embd_input.concat(first_antiprompt)
  end
  end
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation

  if input_echo
  output = []
- embd.each { |token| output << context.token_to_piece(token) }
+ embd.each { |token| output << context.model.token_to_piece(token) }
  output_str = output.join
  output_str.chomp!(antiprompt) if first_input
  print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  if embd_input.size <= n_consumed
  if antiprompt.size.positive?
  last_output = []
- last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
+ last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
  last_output_str = last_output.join

  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
@@ -150,7 +150,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  break interactive = false if buffer.nil?

  if buffer.size > 1
- line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
+ line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
  embd_input.concat(line_input)
  n_remain -= line_input.size
  end
  end
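The chat example changes above follow the same pattern throughout: tokenization and token-to-text conversion move from the Context to the Model it wraps, and eval no longer takes an n_threads keyword. A minimal sketch of the 0.6.0-style calls (not taken from the gem; the model path and prompt are placeholders):

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# Tokenization now goes through the model rather than the context.
tokens = context.model.tokenize(text: 'User: Hello', add_bos: true)

# eval no longer accepts the n_threads keyword removed in the diff above.
context.eval(tokens: tokens, n_past: 0)

# Converting token ids back to text also moves to the model.
puts tokens.map { |t| context.model.token_to_piece(t) }.join
```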
data/examples/embedding.rb CHANGED
@@ -12,23 +12,23 @@ class Embedding < Thor # rubocop:disable Style/Documentation
  default_command :main
  desc 'main', 'Extract embedding from prompt'
  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
- option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
  option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
- params = LLaMACpp::ContextParams.new
- params.seed = options[:seed] if options[:seed] != -1
- params.n_gpu_layers = options[:n_gpu_layers]
- params.embedding = true
- model = LLaMACpp::Model.new(model_path: options[:model], params: params)
- context = LLaMACpp::Context.new(model: model)
+ mdl_params = LLaMACpp::ModelParams.new
+ mdl_params.n_gpu_layers = options[:n_gpu_layers]
+ model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
+ ctx_params = LLaMACpp::ContextParams.new
+ ctx_params.embedding = true
+ ctx_params.seed = options[:seed] if options[:seed] != -1
+ context = LLaMACpp::Context.new(model: model, params: ctx_params)

- embd_input = context.tokenize(text: options[:prompt], add_bos: true)
+ embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)

  return unless embd_input.size.positive?

- context.eval(tokens: embd_input, n_past: 0, n_threads: options[:threads])
+ context.eval(tokens: embd_input, n_past: 0)

  context.embeddings.each { |val| print("#{val} ") }
  print("\n")