llama_cpp 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/lib/llama_cpp.rb
CHANGED
@@ -5,9 +5,6 @@ require_relative 'llama_cpp/llama_cpp'
|
|
5
5
|
|
6
6
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
7
7
|
module LLaMACpp
|
8
|
-
# Class alias to match interface of whispercpp gem.
|
9
|
-
Params = ContextParams
|
10
|
-
|
11
8
|
module_function
|
12
9
|
|
13
10
|
# Generates sentences following the given prompt for operation check.
|
@@ -15,7 +12,6 @@ module LLaMACpp
|
|
15
12
|
# @param context [LLaMACpp::Context] The context to use.
|
16
13
|
# @param prompt [String] The prompt to start generation with.
|
17
14
|
# @param n_predict [Integer] The number of tokens to predict.
|
18
|
-
# @param n_threads [Integer] The number of threads.
|
19
15
|
# @param n_keep [Integer] The number of tokens to keep in the context.
|
20
16
|
# @param n_batch [Integer] The number of tokens to process in a batch.
|
21
17
|
# @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
|
@@ -29,14 +25,14 @@ module LLaMACpp
|
|
29
25
|
# @param temperature [Float] The temperature for temperature sampling.
|
30
26
|
# @return [String]
|
31
27
|
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
|
32
|
-
n_predict: 128,
|
28
|
+
n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
|
33
29
|
repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
|
34
30
|
top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
|
35
31
|
raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
|
36
32
|
raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
|
37
33
|
|
38
34
|
spaced_prompt = " #{prompt}"
|
39
|
-
embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
|
35
|
+
embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)
|
40
36
|
|
41
37
|
n_ctx = context.n_ctx
|
42
38
|
raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
|
@@ -47,7 +43,7 @@ module LLaMACpp
|
|
47
43
|
n_consumed = 0
|
48
44
|
n_past = 0
|
49
45
|
n_remain = n_predict
|
50
|
-
n_vocab = context.n_vocab
|
46
|
+
n_vocab = context.model.n_vocab
|
51
47
|
output = []
|
52
48
|
|
53
49
|
while n_remain != 0
|
@@ -58,7 +54,7 @@ module LLaMACpp
|
|
58
54
|
embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
|
59
55
|
end
|
60
56
|
|
61
|
-
context.eval(tokens: embd, n_past: n_past
|
57
|
+
context.eval(tokens: embd, n_past: n_past)
|
62
58
|
end
|
63
59
|
|
64
60
|
n_past += embd.size
|
@@ -99,7 +95,7 @@ module LLaMACpp
|
|
99
95
|
end
|
100
96
|
end
|
101
97
|
|
102
|
-
embd.each { |token| output << context.token_to_piece(token) }
|
98
|
+
embd.each { |token| output << context.model.token_to_piece(token) }
|
103
99
|
|
104
100
|
break if !embd.empty? && embd[-1] == context.token_eos
|
105
101
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -67,14 +67,13 @@ module LLaMACpp
|
|
67
67
|
class Model
|
68
68
|
public
|
69
69
|
|
70
|
-
def initialize: (model_path: String, params: ::LLaMACpp::
|
70
|
+
def initialize: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
|
71
71
|
| () -> void
|
72
72
|
def empty?: () -> bool
|
73
73
|
def free: () -> void
|
74
|
-
def load: (model_path: String, params: ::LLaMACpp::
|
75
|
-
def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
|
74
|
+
def load: (model_path: String, params: ::LLaMACpp::ModelParams) -> void
|
75
|
+
def apply_lora_from_file: (lora_path: String, ?scale: Float, ?base_model_path: String, ?n_threads: Integer) -> void
|
76
76
|
def n_vocab: () -> Integer
|
77
|
-
def n_ctx: () -> Integer
|
78
77
|
def n_ctx_train: () -> Integer
|
79
78
|
def n_embd: () -> Integer
|
80
79
|
def token_to_piece: (Integer) -> String
|
@@ -98,10 +97,50 @@ module LLaMACpp
|
|
98
97
|
def n_eval: () -> Integer
|
99
98
|
end
|
100
99
|
|
100
|
+
class ModelParams
|
101
|
+
public
|
102
|
+
|
103
|
+
def n_gpu_layers: () -> Integer
|
104
|
+
def n_gpu_layers=: (Integer) -> Integer
|
105
|
+
def main_gpu: () -> Integer
|
106
|
+
def main_gpu=: (Integer) -> Integer
|
107
|
+
def tensor_split: () -> Array[Float]
|
108
|
+
def vocab_only: () -> bool
|
109
|
+
def vocab_only=: (bool) -> bool
|
110
|
+
def use_mmap: () -> bool
|
111
|
+
def use_mmap=: (bool) -> bool
|
112
|
+
def use_mlock: () -> bool
|
113
|
+
def use_mlock=: (bool) -> bool
|
114
|
+
end
|
115
|
+
|
116
|
+
class Batch
|
117
|
+
public
|
118
|
+
|
119
|
+
def initialize: (n_tokens: Integer, embd: Integer) -> void
|
120
|
+
def n_tokens=: (Integer) -> Integer
|
121
|
+
def n_tokens: () -> Integer
|
122
|
+
def all_pos_zero=: (Integer) -> Integer
|
123
|
+
def all_pos_zero: () -> Integer
|
124
|
+
def all_pos_one=: (Integer) -> Integer
|
125
|
+
def all_pos_one: () -> Integer
|
126
|
+
def all_seq_id=: (Integer) -> Integer
|
127
|
+
def all_seq_id: () -> Integer
|
128
|
+
def set_token: (Integer, Integer) -> Integer
|
129
|
+
def get_token: (Integer) -> Integer
|
130
|
+
def set_pos: (Integer, Integer) -> Integer
|
131
|
+
def get_pos: (Integer) -> Integer
|
132
|
+
def set_seq_id: (Integer, Integer) -> Integer
|
133
|
+
def get_seq_id: (Integer) -> Integer
|
134
|
+
def set_logit: (Integer, bool) -> bool
|
135
|
+
def get_logit: (Integer) -> bool
|
136
|
+
end
|
137
|
+
|
101
138
|
class Context
|
102
139
|
public
|
103
140
|
|
104
|
-
|
141
|
+
attr_reader model: ::LLaMACpp::Model
|
142
|
+
|
143
|
+
def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
|
105
144
|
def embeddings: () -> Array[Float]
|
106
145
|
def text: (Integer) -> String
|
107
146
|
def score: (Integer) -> Float
|
@@ -109,20 +148,20 @@ module LLaMACpp
|
|
109
148
|
def token_bos: () -> Integer
|
110
149
|
def token_eos: () -> Integer
|
111
150
|
def token_nl: () -> Integer
|
112
|
-
def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer
|
113
|
-
def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer
|
114
|
-
def
|
151
|
+
def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
|
152
|
+
def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
|
153
|
+
def decode: (::LLaMACpp::Batch) -> void
|
115
154
|
def logits: () -> Array[Float]
|
116
155
|
def n_ctx: () -> Integer
|
117
|
-
def n_ctx_train: () -> Integer
|
118
|
-
def n_embd: () -> Integer
|
119
|
-
def n_vocab: () -> Integer
|
120
156
|
def timings: () -> ::LLaMACpp::Timings
|
121
157
|
def print_timings: () -> void
|
122
158
|
def reset_timings: () -> void
|
123
|
-
def token_to_piece: (Integer) -> String
|
124
|
-
def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
|
125
159
|
def kv_cache_token_count: () -> Integer
|
160
|
+
def kv_cache_tokens_rm: (Integer, Integer) -> void
|
161
|
+
def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
|
162
|
+
def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
|
163
|
+
def kv_cache_seq_keep: (Integer) -> void
|
164
|
+
def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
|
126
165
|
def set_rng_seed: (Integer) -> void
|
127
166
|
def load_session_file: (session_path: String) -> void
|
128
167
|
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
@@ -134,6 +173,7 @@ module LLaMACpp
|
|
134
173
|
def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
135
174
|
def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
|
136
175
|
def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
176
|
+
def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
|
137
177
|
def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
|
138
178
|
def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
|
139
179
|
def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
|
@@ -146,37 +186,28 @@ module LLaMACpp
|
|
146
186
|
class ContextParams
|
147
187
|
public
|
148
188
|
|
149
|
-
def
|
150
|
-
def
|
151
|
-
def f16_kv: () -> bool
|
152
|
-
def f16_kv=: (bool) -> bool
|
153
|
-
def logits_all: () -> bool
|
154
|
-
def logits_all=: (bool) -> bool
|
189
|
+
def seed: () -> Integer
|
190
|
+
def seed=: (Integer) -> Integer
|
155
191
|
def n_ctx: () -> Integer
|
156
192
|
def n_ctx=: (Integer) -> Integer
|
157
193
|
def n_batch: () -> Integer
|
158
194
|
def n_batch=: (Integer) -> Integer
|
159
|
-
def
|
160
|
-
def
|
161
|
-
def
|
162
|
-
def
|
163
|
-
def tensor_split: () -> Array[Float]
|
195
|
+
def n_threads: () -> Integer
|
196
|
+
def n_threads=: (Integer) -> Integer
|
197
|
+
def n_threads_batch: () -> Integer
|
198
|
+
def n_threads_batch=: (Integer) -> Integer
|
164
199
|
def rope_freq_base=: (Float) -> Float
|
165
200
|
def rope_freq_base: () -> Float
|
166
201
|
def rope_freq_scale=: (Float) -> Float
|
167
202
|
def rope_freq_scale: () -> Float
|
168
|
-
def low_vram: () -> bool
|
169
|
-
def low_vram=: (bool) -> bool
|
170
203
|
def mul_mat_q: () -> bool
|
171
204
|
def mul_mat_q=: (bool) -> bool
|
172
|
-
def
|
173
|
-
def
|
174
|
-
def
|
175
|
-
def
|
176
|
-
def
|
177
|
-
def
|
178
|
-
def vocab_only: () -> bool
|
179
|
-
def vocab_only=: (bool) -> bool
|
205
|
+
def f16_kv: () -> bool
|
206
|
+
def f16_kv=: (bool) -> bool
|
207
|
+
def logits_all: () -> bool
|
208
|
+
def logits_all=: (bool) -> bool
|
209
|
+
def embedding: () -> bool
|
210
|
+
def embedding=: (bool) -> bool
|
180
211
|
end
|
181
212
|
|
182
213
|
class ModelQuantizeParams
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-09-
|
11
|
+
date: 2023-09-30 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -75,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
75
|
- !ruby/object:Gem::Version
|
76
76
|
version: '0'
|
77
77
|
requirements: []
|
78
|
-
rubygems_version: 3.
|
78
|
+
rubygems_version: 3.4.19
|
79
79
|
signing_key:
|
80
80
|
specification_version: 4
|
81
81
|
summary: Ruby bindings for the llama.cpp.
|