llama_cpp 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/llama_cpp/client.rb ADDED
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module LLaMACpp
+  # Client provides a high-level interface to the LLM model.
+  class Client
+    # Creates a new client.
+    #
+    # @param model_path [String] The path to the model file.
+    # @param lora_adapter_path [String] The path to the LoRA adapter file.
+    # @param lora_base_path [String] The path to the LoRA base model file.
+    # @param n_ctx [Integer] The context size.
+    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
+    # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
+    # @param use_mmap [Boolean] The flag whether to use mmap.
+    # @param use_mlock [Boolean] The flag whether to use mlock.
+    # @param embedding [Boolean] The flag whether to calculate embedding.
+    # @param n_threads [Integer] The number of threads to use.
+    # @param seed [Integer] The seed for the random number generator.
+    # @return [Client]
+    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   embedding: false,
+                   n_threads: 1, seed: 0)
+      @params = {
+        model_path: model_path,
+        lora_adapter_path: lora_adapter_path,
+        lora_base_path: lora_base_path,
+        n_ctx: n_ctx,
+        n_parts: n_parts,
+        memory_f16: memory_f16,
+        use_mmap: use_mmap,
+        use_mlock: use_mlock,
+        embedding: embedding,
+        n_threads: n_threads,
+        seed: seed
+      }
+      @context_params = ContextParams.new
+      @context_params.n_ctx = n_ctx
+      @context_params.n_parts = n_parts
+      @context_params.f16_kv = memory_f16
+      @context_params.use_mmap = use_mmap
+      @context_params.use_mlock = use_mlock
+      @context_params.embedding = embedding
+      @context_params.seed = seed
+      @context = Context.new(model_path: model_path, params: @context_params)
+      return unless lora_adapter_path.is_a?(String)
+
+      if lora_base_path.is_a?(String)
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+      else
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+      end
+    end
+    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+    # Generates completions for a given prompt.
+    #
+    # @param prompt [String] The prompt to generate completions for.
+    # @param max_tokens [Integer] The maximum number of tokens to generate.
+    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+    # @param n_batch [Integer] The batch size.
+    # @param top_k [Integer] The top-k value.
+    # @param top_p [Float] The top-p value.
+    # @param temperature [Float] The temperature value.
+    # @param repeat_penalty [Float] The repeat penalty value.
+    # @return [String]
+    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                    top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+      embd_input = tokenize_prompt(prompt)
+
+      n_ctx = @context.n_ctx
+      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+      last_n_tokens = [0] * n_ctx
+
+      embd = []
+      n_consumed = 0
+      n_past = 0
+      n_remain = max_tokens
+      output = []
+
+      while n_remain != 0
+        unless embd.empty?
+          if n_past + embd.size > n_ctx
+            n_left = n_past - n_keep
+            n_past = n_keep
+            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+          end
+
+          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+        end
+
+        n_past += embd.size
+        embd.clear
+
+        if embd_input.size <= n_consumed
+          start = n_ctx - repeat_last_n
+          id = @context.sample_top_p_top_k(
+            last_n_tokens[start...(start + repeat_last_n)],
+            top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+          )
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          embd.push(id)
+          n_remain -= 1
+        else
+          while embd_input.size > n_consumed
+            embd.push(embd_input[n_consumed])
+            last_n_tokens.shift
+            last_n_tokens.push(embd_input[n_consumed])
+            n_consumed += 1
+            break if embd.size >= n_batch
+          end
+        end
+
+        embd.each { |token| output << @context.token_to_str(token) }
+
+        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      end
+
+      output.join.delete_prefix(" #{prompt}").strip
+    end
+    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+    # def chat(prompt); end
+
+    # Obtains the embedding for a given text.
+    #
+    # @param text [String] The text to obtain the embedding for.
+    # @return [Array<Float>]
+    def embeddings(text)
+      raise 'The embedding option is set to false' unless @params[:embedding]
+
+      embd_input = tokenize_prompt(text)
+      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+      @context.embeddings
+    end
+
+    private
+
+    def tokenize_prompt(prompt)
+      @context.tokenize(text: " #{prompt}", add_bos: true)
+    end
+  end
+end
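For orientation, a minimal usage sketch of the new high-level Client API introduced by this file (the model path and option values below are placeholders, not part of the package):

    require 'llama_cpp'

    # Build a client around a local GGML model file (path is hypothetical).
    client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', n_ctx: 512, seed: 42)

    # Sampling options mirror the keyword arguments documented above
    # (top_k, top_p, temperature, repeat_penalty).
    puts client.completions('Hello, World.', max_tokens: 32)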
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.5'
+  VERSION = '0.0.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-315a95a'
+  LLAMA_CPP_VERSION = 'master-11d9023'
 end
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@
 
 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
+require_relative 'llama_cpp/client'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp
 
   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context]
-  # @param prompt [String]
-  # @param n_threads [Integer]
+  # @param context [LLaMACpp::Context] The context to use.
+  # @param prompt [String] The prompt to start generation with.
+  # @param n_predict [Integer] The number of tokens to predict.
+  # @param n_threads [Integer] The number of threads.
   # @return [String]
-  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-    spaced_prompt = " #{prompt}"
+  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+    raise ArgumentError, 'context must have loaded the model' if context.empty?
+    raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
+    spaced_prompt = " #{prompt}"
     embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
     last_n_tokens = [0] * n_ctx
 
     embd = []
     n_consumed = 0
    n_keep = 10
     n_past = 0
-    n_remain = 128
+    n_remain = n_predict
     repeat_last_n = 64
+    n_batch = 512
     output = []
 
     while n_remain != 0
@@ -62,13 +70,13 @@
          last_n_tokens.shift
          last_n_tokens.push(embd_input[n_consumed])
          n_consumed += 1
-         break if embd.size >= 512
+         break if embd.size >= n_batch
        end
      end
 
      embd.each { |token| output << context.token_to_str(token) }
 
-     break if embd[-1] == LLaMACpp.token_eos
+     break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
    end
 
    output.join.delete_prefix(spaced_prompt).strip
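Under the same placeholder-model assumption, the updated module-level generate helper can be exercised like this; it now raises ArgumentError for an empty context, a non-String prompt, or an over-long prompt:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.n_ctx = 512
    params.seed = 20230429
    context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

    # n_predict replaces the previously hard-coded limit of 128 tokens.
    puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 64, n_threads: 4)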
data/sig/llama_cpp.rbs CHANGED
@@ -5,7 +5,19 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
-  def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
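The model_quantize binding and LLAMA_FTYPE_* constants declared above combine as follows (the input and output paths are placeholders):

    require 'llama_cpp'

    # Quantize an f16 GGML model file down to Q4_0; both paths are hypothetical.
    LLaMACpp.model_quantize(input_path: '/path/to/ggml-model-f16.bin',
                            output_path: '/path/to/ggml-model-q4_0.bin',
                            ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0)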
@@ -18,7 +30,8 @@
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def empty?: () -> bool
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -50,9 +63,20 @@
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def use_mmap: () -> bool
+    def use_mmap=: (bool) -> bool
     def vocab_only: () -> bool
     def vocab_only=: (bool) -> bool
   end
 
   class Params = ContextParams
+
+  class Client
+    def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+                   ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+                   ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+    def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+                    ?top_k: Integer, ?top_p: Float, ?temperature: Float, ?repeat_penalty: Float) -> String
+    def embeddings(String) -> Array[Float]
+  end
 end
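Per the Client signatures above, embeddings are only available when the client is built with embedding: true; a brief sketch with a placeholder model path:

    require 'llama_cpp'

    client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', embedding: true)
    vector = client.embeddings('Hello, World.') # => Array of Floats, one per embedding dimension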
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.7
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-20 00:00:00.000000000 Z
+date: 2023-04-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,12 +26,16 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
 - ext/llama_cpp/src/llama_util.h
 - lib/llama_cpp.rb
+- lib/llama_cpp/client.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb