llama_cpp 0.0.6 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +762 -36
- data/ext/llama_cpp/src/ggml-cuda.h +11 -4
- data/ext/llama_cpp/src/ggml-opencl.c +398 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1957 -909
- data/ext/llama_cpp/src/ggml.h +696 -627
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +91 -12
- data/ext/llama_cpp/src/llama.cpp +755 -159
- data/ext/llama_cpp/src/llama.h +85 -34
- data/lib/llama_cpp/client.rb +174 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +43 -11
- data/sig/llama_cpp.rbs +53 -3
- metadata +6 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,9 +19,11 @@
 # define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION
-#define LLAMA_FILE_MAGIC
-#define LLAMA_FILE_MAGIC_UNVERSIONED
+#define LLAMA_FILE_VERSION 1
+#define LLAMA_FILE_MAGIC 'ggjt'
+#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_SESSION_VERSION 1
 
 #ifdef __cplusplus
 extern "C" {
@@ -39,18 +41,22 @@ extern "C" {
 
     typedef struct llama_token_data {
        llama_token id; // token id
-
+       float logit;    // log-odds of the token
        float p;        // probability of the token
-       float plog;     // log probability of the token
-
    } llama_token_data;
 
+    typedef struct llama_token_data_array {
+        llama_token_data * data;
+        size_t size;
+        bool sorted;
+    } llama_token_data_array;
+
    typedef void (*llama_progress_callback)(float progress, void *ctx);
 
    struct llama_context_params {
        int n_ctx;   // text context
        int n_parts; // -1 for default
-       int seed;    // RNG seed,
+       int seed;    // RNG seed, -1 for random
 
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -73,7 +79,10 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_3
+       // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+       LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
    };
 
    LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +121,28 @@ extern "C" {
                             const char * path_base_model,
                             int n_threads);
 
-    // Returns the
-
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
-    //
-    LLAMA_API
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 
-    // Returns the
-
+    // Returns the maximum size in bytes of the state (rng, logits, embedding
+    // and kv_cache) - will often be smaller after compacting tokens
+    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
 
-    //
-
-
-
-
-
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
+    // Save/load session file
+    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
 
    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
@@ -152,9 +167,9 @@ extern "C" {
                                 int n_max_tokens,
                                 bool add_bos);
 
-    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (struct llama_context * ctx);
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
@@ -168,21 +183,57 @@ extern "C" {
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
    // Special tokens
    LLAMA_API llama_token llama_token_bos();
    LLAMA_API llama_token llama_token_eos();
+    LLAMA_API llama_token llama_token_nl();
+
+    // Sampling functions
+
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+    /// @details Selects the token with the highest probability.
+    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
 
-
-    LLAMA_API llama_token
-            struct llama_context * ctx,
-            const llama_token * last_n_tokens_data,
-            int last_n_tokens_size,
-            int top_k,
-            float top_p,
-            float temp,
-            float repeat_penalty);
+    /// @details Randomly selects a token from the candidates based on their probabilities.
+    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
    // Performance information
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
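Note: the sampling API above replaces the old all-in-one llama_sample_top_p_top_k call with composable routines over a llama_token_data_array, and the Ruby binding exposes them as Context#sample_* methods (see the RBS signatures later in this diff). Below is a minimal, hypothetical Ruby sketch of the Mirostat 2.0 path; it assumes a LLaMACpp::Context named context that has already evaluated a prompt, and the starting mu of 2 * tau simply follows the header comment above.

require 'llama_cpp'

# Assumes `context` is a LLaMACpp::Context that has already run eval on some tokens.
logits = context.logits
candidates = LLaMACpp::TokenDataArray.new(
  Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
)

tau = 5.0        # target cross-entropy ("surprise") of the generated text
eta = 0.1        # learning rate for the mu update
mu  = 2.0 * tau  # per the header docs, mu starts at twice the target cross-entropy

# Per the RBS signature, the binding returns the sampled token id and the updated mu.
id, mu = context.sample_token_mirostat_v2(candidates, tau: tau, eta: eta, mu: mu)
print context.token_to_str(id)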
data/lib/llama_cpp/client.rb
ADDED
@@ -0,0 +1,174 @@
+# frozen_string_literal: true
+
+module LLaMACpp
+  # Client provides a high-level interface to the LLM model.
+  class Client # rubocop:disable Metrics/ClassLength
+    # Creates a new client.
+    #
+    # @param model_path [String] The path to the model file.
+    # @param lora_adapter_path [String] The path to the LoRA adapter file.
+    # @param lora_base_path [String] The path to the LoRA base model file.
+    # @param n_ctx [Integer] The context size.
+    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
+    # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
+    # @param use_mmap [Boolean] The flag whether to use mmap.
+    # @param use_mlock [Boolean] The flag hether to use mlock.
+    # @param embedding [Boolean] The flag whether to calculate embedding.
+    # @param n_threads [Integer] The number of threads to use.
+    # @param seed [Integer] The seed for the random number generator.
+    # @return [Client]
+    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   embedding: false,
+                   n_threads: 1, seed: 0)
+      @params = {
+        model_path: model_path,
+        lora_adapter_path: lora_adapter_path,
+        lora_base_path: lora_base_path,
+        n_ctx: n_ctx,
+        n_parts: n_parts,
+        memory_f16: memory_f16,
+        use_mmap: use_mmap,
+        use_mlock: use_mlock,
+        embedding: embedding,
+        n_threads: n_threads,
+        seed: seed
+      }
+      @context_params = ContextParams.new
+      @context_params.n_ctx = n_ctx
+      @context_params.n_parts = n_parts
+      @context_params.f16_kv = memory_f16
+      @context_params.use_mmap = use_mmap
+      @context_params.use_mlock = use_mlock
+      @context_params.embedding = embedding
+      @context_params.seed = seed
+      @context = Context.new(model_path: model_path, params: @context_params)
+      return unless lora_adapter_path.is_a?(String)
+
+      if lora_base_path.is_a?(String)
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+      else
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+      end
+    end
+    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+    # Generates completions for a given prompt.
+    #
+    # @param prompt [String] The prompt to generate completions for.
+    # @param max_tokens [Integer] The maximum number of tokens to generate.
+    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+    # @param n_batch [Integer] The batch size.
+    # @param frequency [Float] The frequency penalty value.
+    # @param presence [Float] The presence penalty value.
+    # @param top_k [Integer] The top-k value.
+    # @param top_p [Float] The top-p value.
+    # @param tfs_z [Float] The tail free sampling parameter.
+    # @param typical_p [Float] The typical probability value.
+    # @param temperature [Float] The temperature value.
+    # @param repeat_penalty [Float] The repeat penalty value.
+    # @return [String]
+    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                    frequency: 0.0, presence: 0.0,
+                    top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
+      embd_input = tokenize_prompt(prompt)
+
+      n_ctx = @context.n_ctx
+      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+      last_n_tokens = [0] * n_ctx
+
+      embd = []
+      n_consumed = 0
+      n_past = 0
+      n_remain = max_tokens
+      n_vocab = @context.n_vocab
+      output = []
+
+      while n_remain != 0
+        unless embd.empty?
+          if n_past + embd.size > n_ctx
+            n_left = n_past - n_keep
+            n_past = n_keep
+            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+          end
+
+          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+        end
+
+        n_past += embd.size
+        embd.clear
+
+        if embd_input.size <= n_consumed
+          logits = @context.logits
+          base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+          candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+          # apply penalties
+          last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+          @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+          @context.sample_frequency_and_presence_penalties(
+            candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+          )
+
+          # temperature sampling
+          @context.sample_top_k(candidates, k: top_k)
+          @context.sample_tail_free(candidates, z: tfs_z)
+          @context.sample_typical(candidates, prob: typical_p)
+          @context.sample_top_p(candidates, prob: top_p)
+          @context.sample_temperature(candidates, temperature: temperature)
+          id = @context.sample_token(candidates)
+
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          embd.push(id)
+          n_remain -= 1
+        else
+          while embd_input.size > n_consumed
+            embd.push(embd_input[n_consumed])
+            last_n_tokens.shift
+            last_n_tokens.push(embd_input[n_consumed])
+            n_consumed += 1
+            break if embd.size >= n_batch
+          end
+        end
+
+        embd.each { |token| output << @context.token_to_str(token) }
+
+        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      end
+
+      output.join.delete_prefix(" #{prompt}").strip
+    end
+    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+    # def chat(prompt); end
+
+    # Obtains the embedding for a given text.
+    #
+    # @param text [String] The text to obtain the embedding for.
+    # @return [Array<Float>]
+    def embeddings(text)
+      raise 'The embedding option is set to false' unless @params[:embedding]
+
+      embd_input = tokenize_prompt(text)
+      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+      @context.embeddings
+    end
+
+    private
+
+    def tokenize_prompt(prompt)
+      @context.tokenize(text: " #{prompt}", add_bos: true)
+    end
+  end
+end
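For reference, a short usage sketch of the new high-level client; the model path below is a placeholder, and the option values simply mirror the defaults documented above.

require 'llama_cpp'

# 'model.bin' is an illustrative placeholder; point it at a real ggml model file.
client = LLaMACpp::Client.new(model_path: 'model.bin', n_ctx: 512, n_threads: 4, seed: 123)
puts client.completions('Hello, my name is', max_tokens: 32, top_k: 40, top_p: 0.95)

# Client#embeddings raises unless the client was built with embedding: true.
embedder = LLaMACpp::Client.new(model_path: 'model.bin', embedding: true)
vector = embedder.embeddings('Hello, world.')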
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.6'
+  VERSION = '0.1.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-173d0e6'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -2,6 +2,7 @@
 
 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
+require_relative 'llama_cpp/client'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -12,24 +13,40 @@ module LLaMACpp
 
   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context]
-  # @param prompt [String]
-  # @param
+  # @param context [LLaMACpp::Context] The context to use.
+  # @param prompt [String] The prompt to start generation with.
+  # @param n_predict [Integer] The number of tokens to predict.
+  # @param n_threads [Integer] The number of threads.
   # @return [String]
-  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+    raise ArgumentError, 'context must have loaded the model' if context.empty?
+    raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
+    spaced_prompt = " #{prompt}"
     embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
     last_n_tokens = [0] * n_ctx
 
     embd = []
     n_consumed = 0
     n_keep = 10
     n_past = 0
-    n_remain =
+    n_remain = n_predict
     repeat_last_n = 64
+    repeat_penalty = 1.1
+    frequency = 0.0
+    presence = 0.0
+    top_k = 40
+    top_p = 0.95
+    tfs_z = 1.0
+    typical_p = 1.0
+    temperature = 0.8
+    n_batch = 512
+    n_vocab = context.n_vocab
     output = []
 
     while n_remain != 0
@@ -47,10 +64,25 @@ module LLaMACpp
       embd.clear
 
       if embd_input.size <= n_consumed
-
-
-
+        logits = context.logits
+        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+        # apply penalties
+        last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+        context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+        context.sample_frequency_and_presence_penalties(
+          candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
         )
+
+        # temperature sampling
+        context.sample_top_k(candidates, k: top_k)
+        context.sample_tail_free(candidates, z: tfs_z)
+        context.sample_typical(candidates, prob: typical_p)
+        context.sample_top_p(candidates, prob: top_p)
+        context.sample_temperature(candidates, temperature: temperature)
+        id = context.sample_token(candidates)
+
         last_n_tokens.shift
         last_n_tokens.push(id)
 
@@ -62,13 +94,13 @@ module LLaMACpp
           last_n_tokens.shift
           last_n_tokens.push(embd_input[n_consumed])
           n_consumed += 1
-          break if embd.size >=
+          break if embd.size >= n_batch
         end
       end
 
       embd.each { |token| output << context.token_to_str(token) }
 
-      break if embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
     end
 
     output.join.delete_prefix(spaced_prompt).strip
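The module-level helper now validates its arguments and takes an explicit n_predict. A minimal sketch of calling it, assuming a context constructed the same way Client does; the model path is a placeholder.

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 42
context = LLaMACpp::Context.new(model_path: 'model.bin', params: params) # placeholder path

puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 64, n_threads: 4)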
data/sig/llama_cpp.rbs
CHANGED
@@ -11,22 +11,46 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q4_1: Integer
   LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_2: Integer
-
+  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
 
   def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
-  def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
 
+  class TokenData
+    public
+
+    def initialize: (id: Integer, logit: Float, p: Float) -> void
+    def id: () -> Integer
+    def id=: (Integer) -> Integer
+    def logit: () -> Float
+    def logit=: (Float) -> Float
+    def p: () -> Float
+    def p=: (Float) -> Float
+  end
+
+  class TokenDataArray
+    public
+
+    def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
+    def size: () -> Integer
+    def sorted: () -> bool
+  end
+
   class Context
     public
 
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
+    def empty?: () -> bool
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
@@ -36,10 +60,23 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def print_timings: () -> void
     def reset_timings: () -> void
-    def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def kv_cache_token_count: () -> Integer
+    def set_rng_seed: (Integer) -> void
+    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
+    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
+    def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
+    def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
+    def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
+    def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
+    def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
+    def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
   end
 
   class ContextParams
@@ -59,9 +96,22 @@ module LLaMACpp
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def use_mmap: () -> bool
+    def use_mmap=: (bool) -> bool
     def vocab_only: () -> bool
     def vocab_only=: (bool) -> bool
   end
 
   class Params = ContextParams
+
+  class Client
+    def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+                   ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+                   ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+    def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+                    ?frequency: Float, ?presence: Float,
+                    ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
+                    ?repeat_penalty: Float) -> String
+    def embeddings(String) -> Array[Float]
+  end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.1.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-05-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,12 +27,15 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
+- ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
-- ext/llama_cpp/src/llama_util.h
 - lib/llama_cpp.rb
+- lib/llama_cpp/client.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb