llama_cpp 0.0.5 → 0.0.7

data/lib/llama_cpp/client.rb ADDED
@@ -0,0 +1,151 @@
+ # frozen_string_literal: true
+
+ module LLaMACpp
+   # Client provides a high-level interface to the LLM model.
+   class Client
+     # Creates a new client.
+     #
+     # @param model_path [String] The path to the model file.
+     # @param lora_adapter_path [String] The path to the LoRA adapter file.
+     # @param lora_base_path [String] The path to the LoRA base model file.
+     # @param n_ctx [Integer] The context size.
+     # @param n_parts [Integer] The number of model parts (-1 = determine from model dimensions).
+     # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for the memory kv cache.
+     # @param use_mmap [Boolean] The flag whether to use mmap.
+     # @param use_mlock [Boolean] The flag whether to use mlock.
+     # @param embedding [Boolean] The flag whether to calculate the embedding.
+     # @param n_threads [Integer] The number of threads to use.
+     # @param seed [Integer] The seed for the random number generator.
+     # @return [Client]
+     # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+     def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                    n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                    embedding: false,
+                    n_threads: 1, seed: 0)
+       @params = {
+         model_path: model_path,
+         lora_adapter_path: lora_adapter_path,
+         lora_base_path: lora_base_path,
+         n_ctx: n_ctx,
+         n_parts: n_parts,
+         memory_f16: memory_f16,
+         use_mmap: use_mmap,
+         use_mlock: use_mlock,
+         embedding: embedding,
+         n_threads: n_threads,
+         seed: seed
+       }
+       @context_params = ContextParams.new
+       @context_params.n_ctx = n_ctx
+       @context_params.n_parts = n_parts
+       @context_params.f16_kv = memory_f16
+       @context_params.use_mmap = use_mmap
+       @context_params.use_mlock = use_mlock
+       @context_params.embedding = embedding
+       @context_params.seed = seed
+       @context = Context.new(model_path: model_path, params: @context_params)
+       return unless lora_adapter_path.is_a?(String)
+
+       if lora_base_path.is_a?(String)
+         @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+       else
+         @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+       end
+     end
+     # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+     # Generates completions for a given prompt.
+     #
+     # @param prompt [String] The prompt to generate completions for.
+     # @param max_tokens [Integer] The maximum number of tokens to generate.
+     # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+     # @param repeat_last_n [Integer] The number of tokens to consider for the repeat penalty.
+     # @param n_batch [Integer] The batch size.
+     # @param top_k [Integer] The top-k value.
+     # @param top_p [Float] The top-p value.
+     # @param temperature [Float] The temperature value.
+     # @param repeat_penalty [Float] The repeat penalty value.
+     # @return [String]
+     # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+     def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                     top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+       embd_input = tokenize_prompt(prompt)
+
+       n_ctx = @context.n_ctx
+       raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+       last_n_tokens = [0] * n_ctx
+
+       embd = []
+       n_consumed = 0
+       n_past = 0
+       n_remain = max_tokens
+       output = []
+
+       while n_remain != 0
+         unless embd.empty?
+           if n_past + embd.size > n_ctx
+             n_left = n_past - n_keep
+             n_past = n_keep
+             embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+           end
+
+           @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+         end
+
+         n_past += embd.size
+         embd.clear
+
+         if embd_input.size <= n_consumed
+           start = n_ctx - repeat_last_n
+           id = @context.sample_top_p_top_k(
+             last_n_tokens[start...(start + repeat_last_n)],
+             top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+           )
+           last_n_tokens.shift
+           last_n_tokens.push(id)
+
+           embd.push(id)
+           n_remain -= 1
+         else
+           while embd_input.size > n_consumed
+             embd.push(embd_input[n_consumed])
+             last_n_tokens.shift
+             last_n_tokens.push(embd_input[n_consumed])
+             n_consumed += 1
+             break if embd.size >= n_batch
+           end
+         end
+
+         embd.each { |token| output << @context.token_to_str(token) }
+
+         break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+       end
+
+       output.join.delete_prefix(" #{prompt}").strip
+     end
+     # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+     # def chat(prompt); end
+
+     # Obtains the embedding for a given text.
+     #
+     # @param text [String] The text to obtain the embedding for.
+     # @return [Array<Float>]
+     def embeddings(text)
+       raise 'The embedding option is set to false' unless @params[:embedding]
+
+       embd_input = tokenize_prompt(text)
+       raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+       @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+       @context.embeddings
+     end
+
+     private
+
+     def tokenize_prompt(prompt)
+       @context.tokenize(text: " #{prompt}", add_bos: true)
+     end
+   end
+ end
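
As a quick orientation, here is a minimal usage sketch of the new Client class, based only on the signatures added above; the model path is a placeholder and all keyword defaults come from the constructor shown in the diff:

require 'llama_cpp'

# Placeholder path to a local GGML model file.
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', n_threads: 4)

# Text completion with the defaults from the diff (top_k: 40, top_p: 0.95, temperature: 0.8).
puts client.completions('Hello, my name is', max_tokens: 32)

# Embeddings require a client constructed with embedding: true.
embedder = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin', embedding: true)
vector = embedder.embeddings('Hello, world.') # => Array of Float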
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.0.5'
+   VERSION = '0.0.7'
 
    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'master-315a95a'
+   LLAMA_CPP_VERSION = 'master-11d9023'
  end
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@
 
  require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'
+ require_relative 'llama_cpp/client'
 
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp
 
    # Generates sentences following the given prompt for operation check.
    #
-   # @param context [LLaMACpp::Context]
-   # @param prompt [String]
-   # @param n_threads [Integer]
+   # @param context [LLaMACpp::Context] The context to use.
+   # @param prompt [String] The prompt to start generation with.
+   # @param n_predict [Integer] The number of tokens to predict.
+   # @param n_threads [Integer] The number of threads.
    # @return [String]
-   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-     spaced_prompt = " #{prompt}"
+   def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+     raise ArgumentError, 'context must have loaded the model' if context.empty?
+     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
+     spaced_prompt = " #{prompt}"
      embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
      n_ctx = context.n_ctx
+     raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
      last_n_tokens = [0] * n_ctx
 
      embd = []
      n_consumed = 0
      n_keep = 10
      n_past = 0
-     n_remain = 128
+     n_remain = n_predict
      repeat_last_n = 64
+     n_batch = 512
      output = []
 
      while n_remain != 0
@@ -62,13 +70,13 @@
            last_n_tokens.shift
            last_n_tokens.push(embd_input[n_consumed])
            n_consumed += 1
-           break if embd.size >= 512
+           break if embd.size >= n_batch
          end
        end
 
        embd.each { |token| output << context.token_to_str(token) }
 
-       break if embd[-1] == LLaMACpp.token_eos
+       break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
      end
 
      output.join.delete_prefix(spaced_prompt).strip
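
For reference, the new n_predict keyword argument can be exercised as in this sketch (the model path and seed are placeholders; ContextParams and Context are the existing bindings):

params = LLaMACpp::ContextParams.new
params.seed = 42
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# n_predict now controls how many tokens are generated (previously fixed at 128).
puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 32, n_threads: 4)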
data/sig/llama_cpp.rbs CHANGED
@@ -5,7 +5,19 @@ module LLaMACpp
    LLAMA_FILE_MAGIC: String
    LLAMA_FILE_MAGIC_UNVERSIONED: String
 
-   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+   LLAMA_FTYPE_ALL_F32: Integer
+   LLAMA_FTYPE_MOSTLY_F16: Integer
+   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+   LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+   LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+   LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+   LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+
+   def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
    def self?.print_system_info: () -> void
    def self?.token_bos: () -> Integer
    def self?.token_eos: () -> Integer
@@ -18,7 +30,8 @@
      def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                    | () -> void
      def embeddings: () -> Array[Float]
-     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+     def empty?: () -> bool
+     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
      def free: () -> void
      def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
      def logits: () -> Array[Float]
@@ -50,9 +63,20 @@
      def seed=: (Integer) -> Integer
      def use_mlock: () -> bool
      def use_mlock=: (bool) -> bool
+     def use_mmap: () -> bool
+     def use_mmap=: (bool) -> bool
      def vocab_only: () -> bool
      def vocab_only=: (bool) -> bool
    end
 
    class Params = ContextParams
+
+   class Client
+     def initialize: (model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+                      ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+                      ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+     def completions: (String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+                       ?top_k: Integer, ?top_p: Float, ?temperature: Float, ?repeat_penalty: Float) -> String
+     def embeddings: (String) -> Array[Float]
+   end
  end
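
The newly declared model_quantize signature and LLAMA_FTYPE_* constants would be combined roughly like this (a sketch based only on the signature above; both paths are placeholders):

# Quantize a placeholder f16 GGML model down to Q4_0.
LLaMACpp.model_quantize(input_path: '/path/to/ggml-model-f16.bin',
                        output_path: '/path/to/ggml-model-q4_0.bin',
                        ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_0)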
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
-   version: 0.0.5
+   version: 0.0.7
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-20 00:00:00.000000000 Z
+ date: 2023-04-29 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -26,12 +26,16 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h
  - ext/llama_cpp/src/llama_util.h
  - lib/llama_cpp.rb
+ - lib/llama_cpp/client.rb
  - lib/llama_cpp/version.rb
  - sig/llama_cpp.rbs
  homepage: https://github.com/yoshoku/llama_cpp.rb