llama_cpp 0.5.3 → 0.6.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/lib/llama_cpp.rb
CHANGED
@@ -5,9 +5,6 @@ require_relative 'llama_cpp/llama_cpp'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
-  # Class alias to match interface of whispercpp gem.
-  Params = ContextParams
-
   module_function
 
   # Generates sentences following the given prompt for operation check.
@@ -15,7 +12,6 @@ module LLaMACpp
   # @param context [LLaMACpp::Context] The context to use.
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
-  # @param n_threads [Integer] The number of threads.
   # @param n_keep [Integer] The number of tokens to keep in the context.
   # @param n_batch [Integer] The number of tokens to process in a batch.
   # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
@@ -29,14 +25,14 @@ module LLaMACpp
   # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
   def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
-               n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+               n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
                repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
                top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
     spaced_prompt = " #{prompt}"
-    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
+    embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
     raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
@@ -47,7 +43,7 @@ module LLaMACpp
     n_consumed = 0
     n_past = 0
     n_remain = n_predict
-    n_vocab = context.n_vocab
+    n_vocab = context.model.n_vocab
     output = []
 
     while n_remain != 0
@@ -58,7 +54,7 @@ module LLaMACpp
           embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
         end
 
-        context.eval(tokens: embd, n_past: n_past, n_threads: n_threads)
+        context.eval(tokens: embd, n_past: n_past)
       end
 
       n_past += embd.size
@@ -99,7 +95,7 @@ module LLaMACpp
         end
       end
 
-      embd.each { |token| output << context.token_to_piece(token) }
+      embd.each { |token| output << context.model.token_to_piece(token) }
 
       break if !embd.empty? && embd[-1] == context.token_eos
     end
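Taken together, the changes in this file are the user-visible API break in 0.6.0: tokenization, vocabulary size, and piece conversion move from Context to Model, and eval no longer accepts an n_threads keyword (per the .rbs below, thread counts now live on ContextParams). A minimal before/after sketch, assuming a placeholder GGUF model path and that ModelParams.new / ContextParams.new build default parameter objects as in earlier releases:

require 'llama_cpp'

# 0.6.0: model parameters and context parameters are separate objects.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# 0.5.3: context.tokenize(...)           => 0.6.0: context.model.tokenize(...)
tokens = context.model.tokenize(text: ' Hello', add_bos: true)

# 0.5.3: context.eval(..., n_threads: 4) => 0.6.0: no n_threads keyword
context.eval(tokens: tokens, n_past: 0)

# 0.5.3: context.token_to_piece(t)       => 0.6.0: context.model.token_to_piece(t)
text = tokens.map { |t| context.model.token_to_piece(t) }.join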
data/sig/llama_cpp.rbs
CHANGED
@@ -67,14 +67,13 @@ module LLaMACpp
   class Model
     public
 
-    def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+    def initialize: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
                     | () -> void
     def empty?: () -> bool
     def free: () -> void
-    def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
-    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
+    def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
     def n_vocab: () -> Integer
-    def n_ctx: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
     def token_to_piece: (Integer) -> String
@@ -98,10 +97,50 @@ module LLaMACpp
     def n_eval: () -> Integer
   end
 
+  class ModelParams
+    public
+
+    def n_gpu_layers: () -> Integer
+    def n_gpu_layers=: (Integer) -> Integer
+    def main_gpu: () -> Integer
+    def main_gpu=: (Integer) -> Integer
+    def tensor_split: () -> Array[Float]
+    def vocab_only: () -> bool
+    def vocab_only=: (bool) -> bool
+    def use_mmap: () -> bool
+    def use_mmap=: (bool) -> bool
+    def use_mlock: () -> bool
+    def use_mlock=: (bool) -> bool
+  end
+
+  class Batch
+    public
+
+    def initialize: (n_tokens: Integer, embd: Integer) -> void
+    def n_tokens=: (Integer) -> Integer
+    def n_tokens: () -> Integer
+    def all_pos_zero=: (Integer) -> Integer
+    def all_pos_zero: () -> Integer
+    def all_pos_one=: (Integer) -> Integer
+    def all_pos_one: () -> Integer
+    def all_seq_id=: (Integer) -> Integer
+    def all_seq_id: () -> Integer
+    def set_token: (Integer, Integer) -> Integer
+    def get_token: (Integer) -> Integer
+    def set_pos: (Integer, Integer) -> Integer
+    def get_pos: (Integer) -> Integer
+    def set_seq_id: (Integer, Integer) -> Integer
+    def get_seq_id: (Integer) -> Integer
+    def set_logit: (Integer, bool) -> bool
+    def get_logit: (Integer) -> bool
+  end
+
   class Context
     public
 
-    def initialize: (model: ::LLaMACpp::Model) -> void
+    attr_reader model: ::LLaMACpp::Model
+
+    def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
     def text: (Integer) -> String
     def score: (Integer) -> Float
@@ -109,20 +148,20 @@ module LLaMACpp
     def token_bos: () -> Integer
     def token_eos: () -> Integer
     def token_nl: () -> Integer
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
-    def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
-    def eval_export: (String) -> bool
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
+    def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
+    def decode: (::LLaMACpp::Batch) -> void
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
-    def n_ctx_train: () -> Integer
-    def n_embd: () -> Integer
-    def n_vocab: () -> Integer
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
-    def token_to_piece: (Integer) -> String
-    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def kv_cache_token_count: () -> Integer
+    def kv_cache_tokens_rm: (Integer, Integer) -> void
+    def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
+    def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
+    def kv_cache_seq_keep: (Integer) -> void
+    def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
@@ -134,6 +173,7 @@ module LLaMACpp
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
     def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
     def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
@@ -146,37 +186,28 @@ module LLaMACpp
   class ContextParams
     public
 
-    def embedding: () -> bool
-    def embedding=: (bool) -> bool
-    def f16_kv: () -> bool
-    def f16_kv=: (bool) -> bool
-    def logits_all: () -> bool
-    def logits_all=: (bool) -> bool
+    def seed: () -> Integer
+    def seed=: (Integer) -> Integer
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
     def n_batch: () -> Integer
     def n_batch=: (Integer) -> Integer
-    def n_gpu_layers: () -> Integer
-    def n_gpu_layers=: (Integer) -> Integer
-    def main_gpu: () -> Integer
-    def main_gpu=: (Integer) -> Integer
-    def tensor_split: () -> Array[Float]
+    def n_threads: () -> Integer
+    def n_threads=: (Integer) -> Integer
+    def n_threads_batch: () -> Integer
+    def n_threads_batch=: (Integer) -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
     def rope_freq_scale: () -> Float
-    def low_vram: () -> bool
-    def low_vram=: (bool) -> bool
     def mul_mat_q: () -> bool
     def mul_mat_q=: (bool) -> bool
-    def seed: () -> Integer
-    def seed=: (Integer) -> Integer
-    def use_mmap: () -> bool
-    def use_mmap=: (bool) -> bool
-    def use_mlock: () -> bool
-    def use_mlock=: (bool) -> bool
-    def vocab_only: () -> bool
-    def vocab_only=: (bool) -> bool
+    def f16_kv: () -> bool
+    def f16_kv=: (bool) -> bool
+    def logits_all: () -> bool
+    def logits_all=: (bool) -> bool
+    def embedding: () -> bool
+    def embedding=: (bool) -> bool
   end
 
   class ModelQuantizeParams
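The signatures above also introduce the batch-decoding path (Batch plus Context#decode) and the kv-cache sequence helpers. Below is an illustrative sketch written against the declared signatures only; the argument ordering of the kv_cache_seq_* calls is assumed to follow the upstream llama.cpp (seq_id, p0, p1) convention, and the model path is a placeholder:

require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)
tokens = context.model.tokenize(text: ' Hello', add_bos: true)

# Fill a batch one token per slot, requesting logits only for the last token.
batch = LLaMACpp::Batch.new(n_tokens: tokens.size, embd: 0)
batch.n_tokens = tokens.size
tokens.each_with_index do |token, i|
  batch.set_token(i, token)
  batch.set_pos(i, i)                      # position within the sequence
  batch.set_seq_id(i, 0)                   # a single sequence with id 0
  batch.set_logit(i, i == tokens.size - 1) # logits for the final token only
end
context.decode(batch)

# Drop sequence 0 from the kv cache; p1 = -1 meaning "to the end" is an
# assumption carried over from the C API.
context.kv_cache_seq_rm(0, 0, -1)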
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.5.3
+  version: 0.6.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-09-
+date: 2023-09-30 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -75,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.4.19
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.