llama_cpp 0.0.6 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +762 -36
- data/ext/llama_cpp/src/ggml-cuda.h +11 -4
- data/ext/llama_cpp/src/ggml-opencl.c +398 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1957 -909
- data/ext/llama_cpp/src/ggml.h +696 -627
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +91 -12
- data/ext/llama_cpp/src/llama.cpp +755 -159
- data/ext/llama_cpp/src/llama.h +85 -34
- data/lib/llama_cpp/client.rb +174 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +43 -11
- data/sig/llama_cpp.rbs +53 -3
- metadata +6 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,9 +19,11 @@
|
|
19
19
|
# define LLAMA_API
|
20
20
|
#endif
|
21
21
|
|
22
|
-
#define LLAMA_FILE_VERSION
|
23
|
-
#define LLAMA_FILE_MAGIC
|
24
|
-
#define LLAMA_FILE_MAGIC_UNVERSIONED
|
22
|
+
#define LLAMA_FILE_VERSION 1
|
23
|
+
#define LLAMA_FILE_MAGIC 'ggjt'
|
24
|
+
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
|
25
|
+
#define LLAMA_SESSION_MAGIC 'ggsn'
|
26
|
+
#define LLAMA_SESSION_VERSION 1
|
25
27
|
|
26
28
|
#ifdef __cplusplus
|
27
29
|
extern "C" {
|
@@ -39,18 +41,22 @@ extern "C" {
|
|
39
41
|
|
40
42
|
typedef struct llama_token_data {
|
41
43
|
llama_token id; // token id
|
42
|
-
|
44
|
+
float logit; // log-odds of the token
|
43
45
|
float p; // probability of the token
|
44
|
-
float plog; // log probability of the token
|
45
|
-
|
46
46
|
} llama_token_data;
|
47
47
|
|
48
|
+
typedef struct llama_token_data_array {
|
49
|
+
llama_token_data * data;
|
50
|
+
size_t size;
|
51
|
+
bool sorted;
|
52
|
+
} llama_token_data_array;
|
53
|
+
|
48
54
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
49
55
|
|
50
56
|
struct llama_context_params {
|
51
57
|
int n_ctx; // text context
|
52
58
|
int n_parts; // -1 for default
|
53
|
-
int seed; // RNG seed,
|
59
|
+
int seed; // RNG seed, -1 for random
|
54
60
|
|
55
61
|
bool f16_kv; // use fp16 for KV cache
|
56
62
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
@@ -73,7 +79,10 @@ extern "C" {
|
|
73
79
|
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
74
80
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
75
81
|
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
76
|
-
LLAMA_FTYPE_MOSTLY_Q4_3
|
82
|
+
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
|
83
|
+
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
84
|
+
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
85
|
+
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
77
86
|
};
|
78
87
|
|
79
88
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
@@ -112,22 +121,28 @@ extern "C" {
|
|
112
121
|
const char * path_base_model,
|
113
122
|
int n_threads);
|
114
123
|
|
115
|
-
// Returns the
|
116
|
-
|
117
|
-
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
124
|
+
// Returns the number of tokens in the KV cache
|
125
|
+
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
118
126
|
|
119
|
-
//
|
120
|
-
LLAMA_API
|
127
|
+
// Sets the current rng seed.
|
128
|
+
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
|
121
129
|
|
122
|
-
// Returns the
|
123
|
-
|
130
|
+
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
131
|
+
// and kv_cache) - will often be smaller after compacting tokens
|
132
|
+
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
|
124
133
|
|
125
|
-
//
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
134
|
+
// Copies the state to the specified destination address.
|
135
|
+
// Destination needs to have allocated enough memory.
|
136
|
+
// Returns the number of bytes copied
|
137
|
+
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
|
138
|
+
|
139
|
+
// Set the state reading from the specified address
|
140
|
+
// Returns the number of bytes read
|
141
|
+
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
|
142
|
+
|
143
|
+
// Save/load session file
|
144
|
+
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
145
|
+
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
|
131
146
|
|
132
147
|
// Run the llama inference to obtain the logits and probabilities for the next token.
|
133
148
|
// tokens + n_tokens is the provided batch of new tokens to process
|
@@ -152,9 +167,9 @@ extern "C" {
|
|
152
167
|
int n_max_tokens,
|
153
168
|
bool add_bos);
|
154
169
|
|
155
|
-
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
|
156
|
-
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
|
157
|
-
LLAMA_API int llama_n_embd (struct llama_context * ctx);
|
170
|
+
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
171
|
+
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
172
|
+
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
158
173
|
|
159
174
|
// Token logits obtained from the last call to llama_eval()
|
160
175
|
// The logits for the last token are stored in the last row
|
@@ -168,21 +183,57 @@ extern "C" {
|
|
168
183
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
169
184
|
|
170
185
|
// Token Id -> String. Uses the vocabulary in the provided context
|
171
|
-
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
|
186
|
+
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
|
172
187
|
|
173
188
|
// Special tokens
|
174
189
|
LLAMA_API llama_token llama_token_bos();
|
175
190
|
LLAMA_API llama_token llama_token_eos();
|
191
|
+
LLAMA_API llama_token llama_token_nl();
|
192
|
+
|
193
|
+
// Sampling functions
|
194
|
+
|
195
|
+
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
196
|
+
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
|
197
|
+
|
198
|
+
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
199
|
+
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
200
|
+
|
201
|
+
/// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
|
202
|
+
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
203
|
+
|
204
|
+
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
205
|
+
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
|
206
|
+
|
207
|
+
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
208
|
+
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
|
209
|
+
|
210
|
+
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
211
|
+
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
|
212
|
+
|
213
|
+
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
214
|
+
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
|
215
|
+
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
216
|
+
|
217
|
+
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
218
|
+
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
219
|
+
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
220
|
+
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
221
|
+
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
222
|
+
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
223
|
+
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
|
224
|
+
|
225
|
+
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
226
|
+
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
227
|
+
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
228
|
+
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
229
|
+
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
230
|
+
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
231
|
+
|
232
|
+
/// @details Selects the token with the highest probability.
|
233
|
+
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
|
176
234
|
|
177
|
-
|
178
|
-
LLAMA_API llama_token
|
179
|
-
struct llama_context * ctx,
|
180
|
-
const llama_token * last_n_tokens_data,
|
181
|
-
int last_n_tokens_size,
|
182
|
-
int top_k,
|
183
|
-
float top_p,
|
184
|
-
float temp,
|
185
|
-
float repeat_penalty);
|
235
|
+
/// @details Randomly selects a token from the candidates based on their probabilities.
|
236
|
+
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
186
237
|
|
187
238
|
// Performance information
|
188
239
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
@@ -0,0 +1,174 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module LLaMACpp
|
4
|
+
# Client provides a high-level interface to the LLM model.
|
5
|
+
class Client # rubocop:disable Metrics/ClassLength
|
6
|
+
# Creates a new client.
|
7
|
+
#
|
8
|
+
# @param model_path [String] The path to the model file.
|
9
|
+
# @param lora_adapter_path [String] The path to the LoRA adapter file.
|
10
|
+
# @param lora_base_path [String] The path to the LoRA base model file.
|
11
|
+
# @param n_ctx [Integer] The context size.
|
12
|
+
# @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
|
13
|
+
# @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
|
14
|
+
# @param use_mmap [Boolean] The flag whether to use mmap.
|
15
|
+
# @param use_mlock [Boolean] The flag whether to use mlock.
|
16
|
+
# @param embedding [Boolean] The flag whether to calculate embedding.
|
17
|
+
# @param n_threads [Integer] The number of threads to use.
|
18
|
+
# @param seed [Integer] The seed for the random number generator.
|
19
|
+
# @return [Client]
|
20
|
+
# rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
|
21
|
+
def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
|
22
|
+
n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
|
23
|
+
embedding: false,
|
24
|
+
n_threads: 1, seed: 0)
|
25
|
+
@params = {
|
26
|
+
model_path: model_path,
|
27
|
+
lora_adapter_path: lora_adapter_path,
|
28
|
+
lora_base_path: lora_base_path,
|
29
|
+
n_ctx: n_ctx,
|
30
|
+
n_parts: n_parts,
|
31
|
+
memory_f16: memory_f16,
|
32
|
+
use_mmap: use_mmap,
|
33
|
+
use_mlock: use_mlock,
|
34
|
+
embedding: embedding,
|
35
|
+
n_threads: n_threads,
|
36
|
+
seed: seed
|
37
|
+
}
|
38
|
+
@context_params = ContextParams.new
|
39
|
+
@context_params.n_ctx = n_ctx
|
40
|
+
@context_params.n_parts = n_parts
|
41
|
+
@context_params.f16_kv = memory_f16
|
42
|
+
@context_params.use_mmap = use_mmap
|
43
|
+
@context_params.use_mlock = use_mlock
|
44
|
+
@context_params.embedding = embedding
|
45
|
+
@context_params.seed = seed
|
46
|
+
@context = Context.new(model_path: model_path, params: @context_params)
|
47
|
+
return unless lora_adapter_path.is_a?(String)
|
48
|
+
|
49
|
+
if lora_base_path.is_a?(String)
|
50
|
+
@context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
|
51
|
+
else
|
52
|
+
@context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
# rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
|
56
|
+
|
57
|
+
# Generates completions for a given prompt.
|
58
|
+
#
|
59
|
+
# @param prompt [String] The prompt to generate completions for.
|
60
|
+
# @param max_tokens [Integer] The maximum number of tokens to generate.
|
61
|
+
# @param n_keep [Integer] The number of tokens to keep from the initial prompt.
|
62
|
+
# @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
|
63
|
+
# @param n_batch [Integer] The batch size.
|
64
|
+
# @param frequency [Float] The frequency penalty value.
|
65
|
+
# @param presence [Float] The presence penalty value.
|
66
|
+
# @param top_k [Integer] The top-k value.
|
67
|
+
# @param top_p [Float] The top-p value.
|
68
|
+
# @param tfs_z [Float] The tail free sampling parameter.
|
69
|
+
# @param typical_p [Float] The typical probability value.
|
70
|
+
# @param temperature [Float] The temperature value.
|
71
|
+
# @param repeat_penalty [Float] The repeat penalty value.
|
72
|
+
# @return [String]
|
73
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
|
74
|
+
def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
|
75
|
+
frequency: 0.0, presence: 0.0,
|
76
|
+
top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
|
77
|
+
embd_input = tokenize_prompt(prompt)
|
78
|
+
|
79
|
+
n_ctx = @context.n_ctx
|
80
|
+
raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
|
81
|
+
|
82
|
+
last_n_tokens = [0] * n_ctx
|
83
|
+
|
84
|
+
embd = []
|
85
|
+
n_consumed = 0
|
86
|
+
n_past = 0
|
87
|
+
n_remain = max_tokens
|
88
|
+
n_vocab = @context.n_vocab
|
89
|
+
output = []
|
90
|
+
|
91
|
+
while n_remain != 0
|
92
|
+
unless embd.empty?
|
93
|
+
if n_past + embd.size > n_ctx
|
94
|
+
n_left = n_past - n_keep
|
95
|
+
n_past = n_keep
|
96
|
+
embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
|
97
|
+
end
|
98
|
+
|
99
|
+
@context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
|
100
|
+
end
|
101
|
+
|
102
|
+
n_past += embd.size
|
103
|
+
embd.clear
|
104
|
+
|
105
|
+
if embd_input.size <= n_consumed
|
106
|
+
logits = @context.logits
|
107
|
+
base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
|
108
|
+
candidates = LLaMACpp::TokenDataArray.new(base_candidates)
|
109
|
+
|
110
|
+
# apply penalties
|
111
|
+
last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
|
112
|
+
@context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
|
113
|
+
@context.sample_frequency_and_presence_penalties(
|
114
|
+
candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
|
115
|
+
)
|
116
|
+
|
117
|
+
# temperature sampling
|
118
|
+
@context.sample_top_k(candidates, k: top_k)
|
119
|
+
@context.sample_tail_free(candidates, z: tfs_z)
|
120
|
+
@context.sample_typical(candidates, prob: typical_p)
|
121
|
+
@context.sample_top_p(candidates, prob: top_p)
|
122
|
+
@context.sample_temperature(candidates, temperature: temperature)
|
123
|
+
id = @context.sample_token(candidates)
|
124
|
+
|
125
|
+
last_n_tokens.shift
|
126
|
+
last_n_tokens.push(id)
|
127
|
+
|
128
|
+
last_n_tokens.shift
|
129
|
+
last_n_tokens.push(id)
|
130
|
+
|
131
|
+
embd.push(id)
|
132
|
+
n_remain -= 1
|
133
|
+
else
|
134
|
+
while embd_input.size > n_consumed
|
135
|
+
embd.push(embd_input[n_consumed])
|
136
|
+
last_n_tokens.shift
|
137
|
+
last_n_tokens.push(embd_input[n_consumed])
|
138
|
+
n_consumed += 1
|
139
|
+
break if embd.size >= n_batch
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
embd.each { |token| output << @context.token_to_str(token) }
|
144
|
+
|
145
|
+
break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
|
146
|
+
end
|
147
|
+
|
148
|
+
output.join.delete_prefix(" #{prompt}").strip
|
149
|
+
end
|
150
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
|
151
|
+
|
152
|
+
# def chat(prompt); end
|
153
|
+
|
154
|
+
# Obtains the embedding for a given text.
|
155
|
+
#
|
156
|
+
# @param text [String] The text to obtain the embedding for.
|
157
|
+
# @return [Array<Float>]
|
158
|
+
def embeddings(text)
|
159
|
+
raise 'The embedding option is set to false' unless @params[:embedding]
|
160
|
+
|
161
|
+
embd_input = tokenize_prompt(text)
|
162
|
+
raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
|
163
|
+
|
164
|
+
@context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
|
165
|
+
@context.embeddings
|
166
|
+
end
|
167
|
+
|
168
|
+
private
|
169
|
+
|
170
|
+
def tokenize_prompt(prompt)
|
171
|
+
@context.tokenize(text: " #{prompt}", add_bos: true)
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.0.6'
|
6
|
+
VERSION = '0.1.0'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-
|
9
|
+
LLAMA_CPP_VERSION = 'master-173d0e6'
|
10
10
|
end
|
data/lib/llama_cpp.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative 'llama_cpp/version'
|
4
4
|
require_relative 'llama_cpp/llama_cpp'
|
5
|
+
require_relative 'llama_cpp/client'
|
5
6
|
|
6
7
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
7
8
|
module LLaMACpp
|
@@ -12,24 +13,40 @@ module LLaMACpp
|
|
12
13
|
|
13
14
|
# Generates sentences following the given prompt for operation check.
|
14
15
|
#
|
15
|
-
# @param context [LLaMACpp::Context]
|
16
|
-
# @param prompt [String]
|
17
|
-
# @param
|
16
|
+
# @param context [LLaMACpp::Context] The context to use.
|
17
|
+
# @param prompt [String] The prompt to start generation with.
|
18
|
+
# @param n_predict [Integer] The number of tokens to predict.
|
19
|
+
# @param n_threads [Integer] The number of threads.
|
18
20
|
# @return [String]
|
19
|
-
def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
|
20
|
-
|
21
|
+
def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
22
|
+
raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
|
23
|
+
raise ArgumentError, 'context must have loaded the model' if context.empty?
|
24
|
+
raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
|
21
25
|
|
26
|
+
spaced_prompt = " #{prompt}"
|
22
27
|
embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
|
23
28
|
|
24
29
|
n_ctx = context.n_ctx
|
30
|
+
raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
|
31
|
+
|
25
32
|
last_n_tokens = [0] * n_ctx
|
26
33
|
|
27
34
|
embd = []
|
28
35
|
n_consumed = 0
|
29
36
|
n_keep = 10
|
30
37
|
n_past = 0
|
31
|
-
n_remain =
|
38
|
+
n_remain = n_predict
|
32
39
|
repeat_last_n = 64
|
40
|
+
repeat_penalty = 1.1
|
41
|
+
frequency = 0.0
|
42
|
+
presence = 0.0
|
43
|
+
top_k = 40
|
44
|
+
top_p = 0.95
|
45
|
+
tfs_z = 1.0
|
46
|
+
typical_p = 1.0
|
47
|
+
temperature = 0.8
|
48
|
+
n_batch = 512
|
49
|
+
n_vocab = context.n_vocab
|
33
50
|
output = []
|
34
51
|
|
35
52
|
while n_remain != 0
|
@@ -47,10 +64,25 @@ module LLaMACpp
|
|
47
64
|
embd.clear
|
48
65
|
|
49
66
|
if embd_input.size <= n_consumed
|
50
|
-
|
51
|
-
|
52
|
-
|
67
|
+
logits = context.logits
|
68
|
+
base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
|
69
|
+
candidates = LLaMACpp::TokenDataArray.new(base_candidates)
|
70
|
+
|
71
|
+
# apply penalties
|
72
|
+
last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
|
73
|
+
context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
|
74
|
+
context.sample_frequency_and_presence_penalties(
|
75
|
+
candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
|
53
76
|
)
|
77
|
+
|
78
|
+
# temperature sampling
|
79
|
+
context.sample_top_k(candidates, k: top_k)
|
80
|
+
context.sample_tail_free(candidates, z: tfs_z)
|
81
|
+
context.sample_typical(candidates, prob: typical_p)
|
82
|
+
context.sample_top_p(candidates, prob: top_p)
|
83
|
+
context.sample_temperature(candidates, temperature: temperature)
|
84
|
+
id = context.sample_token(candidates)
|
85
|
+
|
54
86
|
last_n_tokens.shift
|
55
87
|
last_n_tokens.push(id)
|
56
88
|
|
@@ -62,13 +94,13 @@ module LLaMACpp
|
|
62
94
|
last_n_tokens.shift
|
63
95
|
last_n_tokens.push(embd_input[n_consumed])
|
64
96
|
n_consumed += 1
|
65
|
-
break if embd.size >=
|
97
|
+
break if embd.size >= n_batch
|
66
98
|
end
|
67
99
|
end
|
68
100
|
|
69
101
|
embd.each { |token| output << context.token_to_str(token) }
|
70
102
|
|
71
|
-
break if embd[-1] == LLaMACpp.token_eos
|
103
|
+
break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
|
72
104
|
end
|
73
105
|
|
74
106
|
output.join.delete_prefix(spaced_prompt).strip
|
data/sig/llama_cpp.rbs
CHANGED
@@ -11,22 +11,46 @@ module LLaMACpp
|
|
11
11
|
LLAMA_FTYPE_MOSTLY_Q4_1: Integer
|
12
12
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
|
13
13
|
LLAMA_FTYPE_MOSTLY_Q4_2: Integer
|
14
|
-
|
14
|
+
LLAMA_FTYPE_MOSTLY_Q8_0: Integer
|
15
|
+
LLAMA_FTYPE_MOSTLY_Q5_0: Integer
|
16
|
+
LLAMA_FTYPE_MOSTLY_Q5_1: Integer
|
15
17
|
|
16
18
|
def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
|
17
|
-
def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
|
19
|
+
def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
|
18
20
|
def self?.print_system_info: () -> void
|
19
21
|
def self?.token_bos: () -> Integer
|
20
22
|
def self?.token_eos: () -> Integer
|
23
|
+
def self?.token_nl: () -> Integer
|
21
24
|
def self?.mmap_supported?: () -> bool
|
22
25
|
def self?.mlock_supported?: () -> bool
|
23
26
|
|
27
|
+
class TokenData
|
28
|
+
public
|
29
|
+
|
30
|
+
def initialize: (id: Integer, logit: Float, p: Float) -> void
|
31
|
+
def id: () -> Integer
|
32
|
+
def id=: (Integer) -> Integer
|
33
|
+
def logit: () -> Float
|
34
|
+
def logit=: (Float) -> Float
|
35
|
+
def p: () -> Float
|
36
|
+
def p=: (Float) -> Float
|
37
|
+
end
|
38
|
+
|
39
|
+
class TokenDataArray
|
40
|
+
public
|
41
|
+
|
42
|
+
def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
|
43
|
+
def size: () -> Integer
|
44
|
+
def sorted: () -> bool
|
45
|
+
end
|
46
|
+
|
24
47
|
class Context
|
25
48
|
public
|
26
49
|
|
27
50
|
def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
|
28
51
|
| () -> void
|
29
52
|
def embeddings: () -> Array[Float]
|
53
|
+
def empty?: () -> bool
|
30
54
|
def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
|
31
55
|
def free: () -> void
|
32
56
|
def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
|
@@ -36,10 +60,23 @@ module LLaMACpp
|
|
36
60
|
def n_vocab: () -> Integer
|
37
61
|
def print_timings: () -> void
|
38
62
|
def reset_timings: () -> void
|
39
|
-
def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
|
40
63
|
def token_to_str: (Integer) -> String
|
41
64
|
def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
|
42
65
|
def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
|
66
|
+
def kv_cache_token_count: () -> Integer
|
67
|
+
def set_rng_seed: (Integer) -> void
|
68
|
+
def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
|
69
|
+
def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
|
70
|
+
def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
|
71
|
+
def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
|
72
|
+
def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
73
|
+
def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
|
74
|
+
def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
75
|
+
def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
|
76
|
+
def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
|
77
|
+
def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
|
78
|
+
def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
|
79
|
+
def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
|
43
80
|
end
|
44
81
|
|
45
82
|
class ContextParams
|
@@ -59,9 +96,22 @@ module LLaMACpp
|
|
59
96
|
def seed=: (Integer) -> Integer
|
60
97
|
def use_mlock: () -> bool
|
61
98
|
def use_mlock=: (bool) -> bool
|
99
|
+
def use_mmap: () -> bool
|
100
|
+
def use_mmap=: (bool) -> bool
|
62
101
|
def vocab_only: () -> bool
|
63
102
|
def vocab_only=: (bool) -> bool
|
64
103
|
end
|
65
104
|
|
66
105
|
class Params = ContextParams
|
106
|
+
|
107
|
+
class Client
|
108
|
+
def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
|
109
|
+
?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
|
110
|
+
?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
|
111
|
+
def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
|
112
|
+
?frequency: Float, ?presence: Float,
|
113
|
+
?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
|
114
|
+
?repeat_penalty: Float) -> String
|
115
|
+
def embeddings(String) -> Array[Float]
|
116
|
+
end
|
67
117
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -27,12 +27,15 @@ files:
|
|
27
27
|
- ext/llama_cpp/llama_cpp.h
|
28
28
|
- ext/llama_cpp/src/LICENSE
|
29
29
|
- ext/llama_cpp/src/ggml-cuda.h
|
30
|
+
- ext/llama_cpp/src/ggml-opencl.c
|
31
|
+
- ext/llama_cpp/src/ggml-opencl.h
|
30
32
|
- ext/llama_cpp/src/ggml.c
|
31
33
|
- ext/llama_cpp/src/ggml.h
|
34
|
+
- ext/llama_cpp/src/llama-util.h
|
32
35
|
- ext/llama_cpp/src/llama.cpp
|
33
36
|
- ext/llama_cpp/src/llama.h
|
34
|
-
- ext/llama_cpp/src/llama_util.h
|
35
37
|
- lib/llama_cpp.rb
|
38
|
+
- lib/llama_cpp/client.rb
|
36
39
|
- lib/llama_cpp/version.rb
|
37
40
|
- sig/llama_cpp.rbs
|
38
41
|
homepage: https://github.com/yoshoku/llama_cpp.rb
|