llama_cpp 0.17.10 → 0.18.1
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. The information is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +8 -31
- data/ext/llama_cpp/extconf.rb +0 -3
- data/ext/llama_cpp/llama_cpp.c +5174 -0
- data/ext/llama_cpp/llama_cpp.h +0 -5
- data/lib/llama_cpp/version.rb +3 -3
- data/lib/llama_cpp.rb +38 -83
- metadata +4 -13
- data/examples/README.md +0 -92
- data/examples/chat.rb +0 -198
- data/examples/embedding.rb +0 -42
- data/examples/prompt_jp.txt +0 -8
- data/examples/simple.rb +0 -96
- data/ext/llama_cpp/llama_cpp.cpp +0 -3764
- data/sig/llama_cpp.rbs +0 -425
data/ext/llama_cpp/llama_cpp.h CHANGED
data/lib/llama_cpp/version.rb CHANGED

```diff
@@ -1,10 +1,10 @@
 # frozen_string_literal: true
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.17.10'
+  VERSION = '0.18.1'
 
   # The supported version of llama.cpp.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b4713'
 end
```
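After updating, the new constants can be checked directly; a minimal sketch, assuming only that the gem is installed and loadable:

```ruby
# Print the installed binding version and the bundled llama.cpp revision.
require 'llama_cpp'

puts LlamaCpp::VERSION           # => "0.18.1"
puts LlamaCpp::LLAMA_CPP_VERSION # => "b4713"
```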
data/lib/llama_cpp.rb CHANGED

```diff
@@ -4,105 +4,60 @@ require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   module_function
 
   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context] The context to use.
+  # @param context [LlamaCpp::LlamaContext] The context to use.
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
-  # @param n_keep [Integer] The number of tokens to keep in the context.
-  # @param n_batch [Integer] The number of tokens to process in a batch.
-  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
-  # @param repeat_penalty [Float] The repetition penalty.
-  # @param frequency [Float] The frequency penalty.
-  # @param presence [Float] The presence penalty.
-  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
-  # @param top_p [Float] The probability threshold for nucleus sampling.
-  # @param tfs_z [Float] The z parameter for tail-free sampling.
-  # @param typical_p [Float] The probability for typical sampling.
-  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/
-
-               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
-               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
-    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+  def generate(context, prompt, n_predict: 128) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+    raise ArgumentError, 'context must be a LlamaContext' unless context.is_a?(LlamaCpp::LlamaContext)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
-
-
+    model = LlamaCpp.llama_get_model(context)
+    vocab = LlamaCpp.llama_model_get_vocab(model)
 
-
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+    n_prompt = -LlamaCpp.llama_tokenize(vocab, prompt, [], 0, true, true)
 
-
+    prompt_tokens = []
+    raise 'Failed to tokenize the prompt' if LlamaCpp.llama_tokenize(vocab, prompt, prompt_tokens, n_prompt, true,
+                                                                     true).negative?
 
-
-
-
-
-
+    ctx_params = LlamaCpp::LlamaContextParams.new
+    ctx_params.n_ctx = n_prompt + n_predict - 1
+    ctx_params.n_batch = n_prompt
+    ctx_params.no_perf = false
+
+    ctx = LlamaCpp.llama_init_from_model(model, ctx_params)
+
+    sparams = LlamaCpp::LlamaSamplerChainParams.new
+    sparams.no_perf = false
+    smpl = LlamaCpp.llama_sampler_chain_init(sparams)
+    LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)
+
+    batch = LlamaCpp.llama_batch_get_one(prompt_tokens)
+
+    n_pos = 0
     output = []
+    while n_pos + batch.n_tokens < n_prompt + n_predict
+      break if LlamaCpp.llama_decode(ctx, batch) != 0
+
+      n_pos += batch.n_tokens
+
+      new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
+      break if llama_vocab_is_eog?(vocab, new_token_id)
+
+      buf = llama_token_to_piece(vocab, new_token_id, 0, true)
+      output << buf
 
-
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = n_keep
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
-      end
-
-      n_past += embd.size
-      embd.clear
-
-      if embd_input.size <= n_consumed
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        # apply penalties
-        last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-        context.sample_repetition_penalties(
-          candidates, last_n_tokens[-last_n_repeat..],
-          penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
-        )
-
-        # temperature sampling
-        context.sample_top_k(candidates, k: top_k)
-        context.sample_tail_free(candidates, z: tfs_z)
-        context.sample_typical(candidates, prob: typical_p)
-        context.sample_top_p(candidates, prob: top_p)
-        context.sample_temp(candidates, temp: temperature)
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        embd.push(id)
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= n_batch
-        end
-      end
-
-      embd.each { |token| output << context.model.token_to_piece(token) }
-
-      break if !embd.empty? && embd[-1] == context.model.token_eos
+      batch = LlamaCpp.llama_batch_get_one([new_token_id])
     end
 
-    output.join
+    output.join
   end
 end
 
-LLaMACpp
-at_exit { LLaMACpp.backend_free }
+LLaMACpp = LlamaCpp
```
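For orientation, here is a minimal sketch of how the reworked helper might be driven. Only `LlamaCpp.generate`, `LlamaCpp::LlamaContextParams`, and `LlamaCpp.llama_init_from_model` are visible in this diff; the model-loading calls (`llama_backend_init`, `llama_model_default_params`, `llama_model_load_from_file`) are assumed to be exposed by the new C extension as thin wrappers over the llama.cpp b4713 C API.

```ruby
# Sketch only: the model-loading calls below are assumed to mirror the
# llama.cpp C API at b4713 and are not part of this diff.
require 'llama_cpp'

LlamaCpp.llama_backend_init                                 # assumed wrapper
model_params = LlamaCpp.llama_model_default_params          # assumed wrapper
model = LlamaCpp.llama_model_load_from_file('model.gguf', model_params) # assumed wrapper

ctx_params = LlamaCpp::LlamaContextParams.new               # shown in this diff
context = LlamaCpp.llama_init_from_model(model, ctx_params) # shown in this diff

# New signature: the sampling keyword arguments are gone; only n_predict remains.
puts LlamaCpp.generate(context, 'Hello my name is', n_predict: 32)
```

The `LLaMACpp = LlamaCpp` assignment at the end of the new file keeps code written against the old constant name resolving to the new module.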
metadata CHANGED

```diff
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.17.10
+  version: 0.18.1
 platform: ruby
 authors:
 - yoshoku
-autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2025-02-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -22,17 +21,11 @@ files:
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
-- examples/README.md
-- examples/chat.rb
-- examples/embedding.rb
-- examples/prompt_jp.txt
-- examples/simple.rb
 - ext/llama_cpp/extconf.rb
-- ext/llama_cpp/llama_cpp.cpp
+- ext/llama_cpp/llama_cpp.c
 - ext/llama_cpp/llama_cpp.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
-- sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb
 licenses:
 - MIT
@@ -42,7 +35,6 @@ metadata:
   changelog_uri: https://github.com/yoshoku/llama_cpp.rb/blob/main/CHANGELOG.md
   documentation_uri: https://yoshoku.github.io/llama_cpp.rb/doc/
   rubygems_mfa_required: 'true'
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -57,8 +49,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.6.2
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.
 test_files: []
```
data/examples/README.md DELETED

```diff
@@ -1,92 +0,0 @@
-# llama_cpp.rb/examples
-
-## chat.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./chat.rb -m /path/to/quantized-model.bin -t 4
-...
-User: Please tell me the largest city in Japan.
-Bob: Sure. The largest city in Japan is Tokyo.
-User:
-```
-
-### Options
-
-```sh
-$ ./chat.rb help main
-Usage:
-  chat.rb main -m, --model=MODEL
-
-Options:
-  -s, [--seed=N]                         # random seed
-                                         # Default: -1
-  -t, [--threads=N]                      # number of threads
-                                         # Default: 2
-  -m, --model=MODEL                      # path to model file
-  -f, [--file=FILE]                      # prompt file to start generation
-  -r, [--reverse-prompt=REVERSE_PROMPT]  # halt generation at PROMPT, return control in interactive mode
-  -b, [--batch-size=N]                   # batch size for prompt processing
-                                         # Default: 1024
-  -n, [--n-predict=N]                    # number of tokens to predict
-                                         # Default: 256
-      [--keep=N]                         # number of tokens to keep from the initial prompt
-                                         # Default: 48
-      [--repeat-last-n=N]                # last n tokens to consider for penalize
-                                         # Default: 64
-      [--repeat-penalty=N]               # penalize repeat sequence of tokens
-                                         # Default: 1.0
-      [--presence-penalty=N]             # repeat alpha presence penalty
-                                         # Default: 0.0
-      [--frequency-penalty=N]            # repeat alpha frequency penalty
-                                         # Default: 0.0
-      [--top-k=N]                        # top k sampling
-                                         # Default: 40
-      [--top-p=N]                        # top p sampling
-                                         # Default: 0.95
-      [--tfs-z=N]                        # tail free sampling, parameter z
-                                         # Default: 1.0
-      [--typical-p=N]                    # locally typical sampling, parameter p
-                                         # Default: 1.0
-      [--temp=N]                         # temperature
-                                         # Default: 0.8
-      [--n-gpu-layers=N]                 # number of layers on GPU
-                                         # Default: 0
-
-Start chat
-```
-
-## embedding.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./embedding.rb -m /path/to/quantized-model.bin -t 4 -p 'Hello, World.'
-...
-0.7191136479377747 0.5564611554145813 1.4210394620895386 -1.4874695539474487
-```
-
-### Options
-
-```
-$ ./embedding.rb help main
-Usage:
-  embedding.rb main -m, --model=MODEL -p, --prompt=PROMPT
-
-Options:
-  -s, [--seed=N]          # random seed
-                          # Default: -1
-  -t, [--threads=N]       # number of threads
-                          # Default: 2
-  -m, --model=MODEL       # path to model file
-  -p, --prompt=PROMPT     # prompt to generate embedding
-      [--n-gpu-layers=N]  # number of layers on GPU
-                          # Default: 0
-
-Extract embedding from prompt
-```
```
data/examples/chat.rb DELETED

```diff
@@ -1,198 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# chat.rb is a simple chatbot that uses llama_cpp to generate text.
-# It is created with reference to main.cpp and chat.sh in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
-
-require 'llama_cpp'
-require 'thor'
-require 'readline'
-require 'etc'
-
-class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
-  default_command :main
-  desc 'main', 'Start chat'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
-  option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
-  option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
-  option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
-  option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
-  option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
-  option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
-  option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
-  option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
-  option :top_k, type: :numeric, desc: 'top k sampling', default: 40
-  option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
-  option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
-  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
-  option :temp, type: :numeric, desc: 'temperature', default: 0.8
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    antiprompt = options[:reverse_prompt] || 'User:'
-    start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
-
-    embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
-
-    n_ctx = context.n_ctx
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
-    n_keep = options[:keep]
-    n_keep = embd_input.size if n_keep > embd_input.size
-
-    last_n_tokens = [0] * n_ctx
-    interactive = true
-    is_interacting = false
-    input_echo = true
-    first_input = true
-    embd = []
-    n_consumed = 0
-    n_past = 0
-    n_remain = options[:n_predict]
-    n_vocab = context.model.n_vocab
-
-    while interactive
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = [1, n_keep].max
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        0.step(embd.size - 1, options[:batch_size]) do |i|
-          n_eval = [options[:batch_size], embd.size - i].min
-          context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
-          n_past += n_eval
-        end
-      end
-
-      embd.clear
-
-      if embd_input.size <= n_consumed && !is_interacting
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
-        context.sample_repetition_penalties(
-          candidates,
-          last_n_tokens[-last_n_repeat..],
-          penalty_repeat: options[:repeat_penalty],
-          penalty_freq: options[:frequency_penalty],
-          penalty_present: options[:presence_penalty]
-        )
-
-        context.sample_top_k(candidates, k: options[:top_k])
-        context.sample_tail_free(candidates, z: options[:tfs_z])
-        context.sample_typical(candidates, prob: options[:typical_p])
-        context.sample_top_p(candidates, prob: options[:top_p])
-        context.sample_temp(candidates, temp: options[:temp])
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        if id == context.model.token_eos
-          id = context.model.token_nl
-          unless antiprompt.empty?
-            first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
-            embd_input.concat(first_antiprompt)
-          end
-        end
-
-        embd.push(id)
-        input_echo = true
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= options[:batch_size]
-        end
-      end
-
-      if input_echo
-        output = embd.map { |token| context.model.token_to_piece(token) }
-        output_str = output.join
-        output_str.chomp!(antiprompt) if first_input
-        print(output_str)
-      end
-
-      if embd_input.size <= n_consumed
-        if antiprompt.size.positive?
-          last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
-          last_output_str = last_output.join
-
-          search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
-          unless last_output_str.index(antiprompt, search_start_pos).nil?
-            is_interacting = true
-            true
-          end
-        end
-
-        if n_past.positive? && is_interacting
-          if first_input
-            print("\r#{antiprompt}")
-            first_input = false
-          end
-          buffer = Readline.readline(' ')
-          break interactive = false if buffer.nil?
-
-          if buffer.size > 1
-            line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
-            embd_input.concat(line_input)
-            n_remain -= line_input.size
-          end
-
-          input_echo = false
-        end
-
-        is_interacting = false if n_past.positive?
-      end
-
-      if n_remain <= 0 && options[:n_predict] != -1
-        n_remain = options[:n_predict]
-        is_interacting = true
-      end
-    end
-  end
-
-  private
-
-  def read_prompt(filename)
-    return if filename.nil?
-
-    File.read(filename).chomp
-  end
-
-  def default_prompt(antiprompt)
-    # Reference:
-    # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
-    prompt = <<~MSG
-      Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-      User: Hello, Bob.
-      Bob: Hello. How may I help you today?
-      User: Please tell me the largest city in Europe.
-      Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-    MSG
-    prompt + antiprompt
-  end
-end
-
-Chat.start(ARGV)
```
data/examples/embedding.rb DELETED

```diff
@@ -1,42 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# embedding.rb extracts embedding from prompt.
-# It is created with reference to embedding.cpp in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp
-
-require 'llama_cpp'
-require 'thor'
-require 'etc'
-
-class Embedding < Thor # rubocop:disable Style/Documentation
-  default_command :main
-  desc 'main', 'Extract embedding from prompt'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.embedding = true
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
-
-    return unless embd_input.size.positive?
-
-    context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
-
-    context.embeddings.each { |val| print("#{val} ") }
-    print("\n")
-  end
-end
-
-Embedding.start(ARGV)
```
data/examples/prompt_jp.txt DELETED
data/examples/simple.rb DELETED

```diff
@@ -1,96 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# simple.rb is a simple text completion script.
-# It is created with reference to simple.cpp in llama.cpp examples:
-# https://github.com/ggerganov/llama.cpp/blob/master/examples/simple/simple.cpp
-
-require 'llama_cpp'
-require 'thor'
-require 'etc'
-
-class Simple < Thor # rubocop:disable Style/Documentation
-  default_command :main
-  desc 'main', 'Simple completion'
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    n_len = 32
-    model_params = LLaMACpp::ModelParams.new
-    model = LLaMACpp::Model.new(model_path: options[:model], params: model_params)
-    context_params = LLaMACpp::ContextParams.new
-    context_params.seed = 1234
-    context_params.n_ctx = 2048
-    context_params.logits_all = true
-    context_params.n_threads = options[:n_threads]
-    context_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: context_params)
-
-    tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
-    n_ctx = context.n_ctx
-    n_kv_req = tokens_list.size + (n_len - tokens_list.size)
-    raise 'n_kv_req > n_ctx, the required KV cache size is not big enough' if n_kv_req > n_ctx
-
-    print("\nmain: n_len = #{n_len}, n_ctx = #{n_ctx}, n_kv_req = #{n_kv_req}\n\n")
-
-    tokens_list.each { |token| print(context.model.token_to_piece(token)) }
-
-    batch = LLaMACpp::Batch.new(max_n_token: 512, n_embd: 0, max_n_seq: 1)
-    tokens_list.each_with_index do |token, id|
-      batch.set_token(batch.n_tokens, token)
-      batch.set_pos(batch.n_tokens, id)
-      batch.set_n_seq_id(batch.n_tokens, 1)
-      batch.set_seq_id(batch.n_tokens, 0, 0)
-      batch.set_logits(batch.n_tokens, false)
-      batch.n_tokens = batch.n_tokens + 1
-    end
-
-    batch.set_logits(batch.n_tokens - 1, true)
-
-    context.decode(batch)
-
-    n_cur = batch.n_tokens
-    n_decode = 0
-    n_vocab = context.model.n_vocab
-
-    t_start = Time.now
-
-    while n_cur <= n_len
-      logits = context.logits[((batch.n_tokens - 1) * n_vocab)..]
-
-      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i] || 0.0, p: 0.0) }
-      candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-      new_token_id = context.sample_token_greedy(candidates)
-
-      if new_token_id == context.model.token_eos || n_cur == n_len
-        print("\n")
-        break
-      end
-
-      print(context.model.token_to_piece(new_token_id))
-
-      batch.n_tokens = 0
-
-      batch.set_token(batch.n_tokens, new_token_id)
-      batch.set_pos(batch.n_tokens, n_cur)
-      batch.set_n_seq_id(batch.n_tokens, 1)
-      batch.set_seq_id(batch.n_tokens, 0, 0)
-      batch.set_logits(batch.n_tokens, true)
-      batch.n_tokens = batch.n_tokens + 1
-
-      n_decode += 1
-      n_cur += 1
-      context.decode(batch)
-    end
-
-    t_end = Time.now
-
-    print("\nmain: decoded #{n_decode} tokens in #{(t_end - t_start).floor(2)} s, speed: #{n_decode.fdiv(t_end - t_start).floor(2)} t/s\n\n")
-
-    LLaMACpp.backend_free
-  end
-end
-
-Simple.start(ARGV)
```