llama_cpp 0.0.7 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +829 -51
- data/ext/llama_cpp/src/ggml-cuda.h +9 -32
- data/ext/llama_cpp/src/ggml-opencl.c +169 -24
- data/ext/llama_cpp/src/ggml.c +6672 -4376
- data/ext/llama_cpp/src/ggml.h +250 -15
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +710 -217
- data/ext/llama_cpp/src/llama.h +75 -28
- data/lib/llama_cpp/client.rb +30 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +41 -7
- metadata +3 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,9 +19,11 @@
|
|
19
19
|
# define LLAMA_API
|
20
20
|
#endif
|
21
21
|
|
22
|
-
#define LLAMA_FILE_VERSION
|
23
|
-
#define LLAMA_FILE_MAGIC
|
24
|
-
#define LLAMA_FILE_MAGIC_UNVERSIONED
|
22
|
+
#define LLAMA_FILE_VERSION 2
|
23
|
+
#define LLAMA_FILE_MAGIC 'ggjt'
|
24
|
+
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
|
25
|
+
#define LLAMA_SESSION_MAGIC 'ggsn'
|
26
|
+
#define LLAMA_SESSION_VERSION 1
|
25
27
|
|
26
28
|
#ifdef __cplusplus
|
27
29
|
extern "C" {
|
@@ -39,18 +41,22 @@ extern "C" {
|
|
39
41
|
|
40
42
|
typedef struct llama_token_data {
|
41
43
|
llama_token id; // token id
|
42
|
-
|
44
|
+
float logit; // log-odds of the token
|
43
45
|
float p; // probability of the token
|
44
|
-
float plog; // log probability of the token
|
45
|
-
|
46
46
|
} llama_token_data;
|
47
47
|
|
48
|
+
typedef struct llama_token_data_array {
|
49
|
+
llama_token_data * data;
|
50
|
+
size_t size;
|
51
|
+
bool sorted;
|
52
|
+
} llama_token_data_array;
|
53
|
+
|
48
54
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
49
55
|
|
50
56
|
struct llama_context_params {
|
51
|
-
int n_ctx;
|
52
|
-
int
|
53
|
-
int seed;
|
57
|
+
int n_ctx; // text context
|
58
|
+
int n_gpu_layers; // number of layers to store in VRAM
|
59
|
+
int seed; // RNG seed, -1 for random
|
54
60
|
|
55
61
|
bool f16_kv; // use fp16 for KV cache
|
56
62
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
@@ -72,8 +78,8 @@ extern "C" {
|
|
72
78
|
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
73
79
|
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
74
80
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
75
|
-
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, //
|
76
|
-
LLAMA_FTYPE_MOSTLY_Q4_3
|
81
|
+
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
82
|
+
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
|
77
83
|
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
78
84
|
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
79
85
|
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
@@ -116,23 +122,28 @@ extern "C" {
|
|
116
122
|
int n_threads);
|
117
123
|
|
118
124
|
// Returns the number of tokens in the KV cache
|
119
|
-
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
125
|
+
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
120
126
|
|
121
127
|
// Sets the current rng seed.
|
122
128
|
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
|
123
129
|
|
124
|
-
// Returns the size in bytes of the state (rng, logits, embedding
|
125
|
-
|
130
|
+
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
131
|
+
// and kv_cache) - will often be smaller after compacting tokens
|
132
|
+
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
|
126
133
|
|
127
134
|
// Copies the state to the specified destination address.
|
128
135
|
// Destination needs to have allocated enough memory.
|
129
136
|
// Returns the number of bytes copied
|
130
|
-
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
|
137
|
+
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
|
131
138
|
|
132
139
|
// Set the state reading from the specified address
|
133
140
|
// Returns the number of bytes read
|
134
141
|
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
|
135
142
|
|
143
|
+
// Save/load session file
|
144
|
+
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
145
|
+
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
|
146
|
+
|
136
147
|
// Run the llama inference to obtain the logits and probabilities for the next token.
|
137
148
|
// tokens + n_tokens is the provided batch of new tokens to process
|
138
149
|
// n_past is the number of tokens to use from previous eval calls
|
@@ -156,9 +167,9 @@ extern "C" {
|
|
156
167
|
int n_max_tokens,
|
157
168
|
bool add_bos);
|
158
169
|
|
159
|
-
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
|
160
|
-
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
|
161
|
-
LLAMA_API int llama_n_embd (struct llama_context * ctx);
|
170
|
+
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
171
|
+
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
172
|
+
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
162
173
|
|
163
174
|
// Token logits obtained from the last call to llama_eval()
|
164
175
|
// The logits for the last token are stored in the last row
|
@@ -172,21 +183,57 @@ extern "C" {
|
|
172
183
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
173
184
|
|
174
185
|
// Token Id -> String. Uses the vocabulary in the provided context
|
175
|
-
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
|
186
|
+
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
|
176
187
|
|
177
188
|
// Special tokens
|
178
189
|
LLAMA_API llama_token llama_token_bos();
|
179
190
|
LLAMA_API llama_token llama_token_eos();
|
191
|
+
LLAMA_API llama_token llama_token_nl();
|
192
|
+
|
193
|
+
// Sampling functions
|
194
|
+
|
195
|
+
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
196
|
+
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
|
197
|
+
|
198
|
+
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
199
|
+
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
200
|
+
|
201
|
+
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
202
|
+
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
203
|
+
|
204
|
+
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
205
|
+
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
|
206
|
+
|
207
|
+
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
208
|
+
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
209
|
+
|
210
|
+
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
211
|
+
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
|
212
|
+
|
213
|
+
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
214
|
+
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
215
|
+
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
216
|
+
|
217
|
+
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
218
|
+
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
219
|
+
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
220
|
+
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
221
|
+
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
222
|
+
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
223
|
+
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
|
224
|
+
|
225
|
+
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
226
|
+
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
227
|
+
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
228
|
+
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
229
|
+
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
230
|
+
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
231
|
+
|
232
|
+
/// @details Selects the token with the highest probability.
|
233
|
+
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
|
180
234
|
|
181
|
-
|
182
|
-
LLAMA_API llama_token
|
183
|
-
struct llama_context * ctx,
|
184
|
-
const llama_token * last_n_tokens_data,
|
185
|
-
int last_n_tokens_size,
|
186
|
-
int top_k,
|
187
|
-
float top_p,
|
188
|
-
float temp,
|
189
|
-
float repeat_penalty);
|
235
|
+
/// @details Randomly selects a token from the candidates based on their probabilities.
|
236
|
+
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
190
237
|
|
191
238
|
// Performance information
|
192
239
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
data/lib/llama_cpp/client.rb
CHANGED
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
module LLaMACpp
|
4
4
|
# Client provides a high-level interface to the LLM model.
|
5
|
-
class Client
|
5
|
+
class Client # rubocop:disable Metrics/ClassLength
|
6
6
|
# Creates a new client.
|
7
7
|
#
|
8
8
|
# @param model_path [String] The path to the model file.
|
9
9
|
# @param lora_adapter_path [String] The path to the LoRA adapter file.
|
10
10
|
# @param lora_base_path [String] The path to the LoRA base model file.
|
11
11
|
# @param n_ctx [Integer] The context size.
|
12
|
-
# @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
|
13
12
|
# @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
|
14
13
|
# @param use_mmap [Boolean] The flag whether to use mmap.
|
15
14
|
# @param use_mlock [Boolean] The flag whether to use mlock.
|
@@ -19,7 +18,7 @@ module LLaMACpp
|
|
19
18
|
# @return [Client]
|
20
19
|
# rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
|
21
20
|
def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
|
22
|
-
n_ctx: 512,
|
21
|
+
n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
|
23
22
|
embedding: false,
|
24
23
|
n_threads: 1, seed: 0)
|
25
24
|
@params = {
|
@@ -27,7 +26,6 @@ module LLaMACpp
|
|
27
26
|
lora_adapter_path: lora_adapter_path,
|
28
27
|
lora_base_path: lora_base_path,
|
29
28
|
n_ctx: n_ctx,
|
30
|
-
n_parts: n_parts,
|
31
29
|
memory_f16: memory_f16,
|
32
30
|
use_mmap: use_mmap,
|
33
31
|
use_mlock: use_mlock,
|
@@ -61,14 +59,19 @@ module LLaMACpp
|
|
61
59
|
# @param n_keep [Integer] The number of tokens to keep from the initial prompt.
|
62
60
|
# @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
|
63
61
|
# @param n_batch [Integer] The batch size.
|
62
|
+
# @param frequency [Float] The frequency penalty value.
|
63
|
+
# @param presence [Float] The presence penalty value.
|
64
64
|
# @param top_k [Integer] The top-k value.
|
65
65
|
# @param top_p [Float] The top-p value.
|
66
|
+
# @param tfs_z [Float] The tail free sampling parameter.
|
67
|
+
# @param typical_p [Float] The typical probability value.
|
66
68
|
# @param temperature [Float] The temperature value.
|
67
69
|
# @param repeat_penalty [Float] The repeat penalty value.
|
68
70
|
# @return [String]
|
69
71
|
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
|
70
72
|
def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
|
71
|
-
|
73
|
+
frequency: 0.0, presence: 0.0,
|
74
|
+
top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
|
72
75
|
embd_input = tokenize_prompt(prompt)
|
73
76
|
|
74
77
|
n_ctx = @context.n_ctx
|
@@ -80,6 +83,7 @@ module LLaMACpp
|
|
80
83
|
n_consumed = 0
|
81
84
|
n_past = 0
|
82
85
|
n_remain = max_tokens
|
86
|
+
n_vocab = @context.n_vocab
|
83
87
|
output = []
|
84
88
|
|
85
89
|
while n_remain != 0
|
@@ -97,11 +101,28 @@ module LLaMACpp
|
|
97
101
|
embd.clear
|
98
102
|
|
99
103
|
if embd_input.size <= n_consumed
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
+
logits = @context.logits
|
105
|
+
base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
|
106
|
+
candidates = LLaMACpp::TokenDataArray.new(base_candidates)
|
107
|
+
|
108
|
+
# apply penalties
|
109
|
+
last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
|
110
|
+
@context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
|
111
|
+
@context.sample_frequency_and_presence_penalties(
|
112
|
+
candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
|
104
113
|
)
|
114
|
+
|
115
|
+
# temperature sampling
|
116
|
+
@context.sample_top_k(candidates, k: top_k)
|
117
|
+
@context.sample_tail_free(candidates, z: tfs_z)
|
118
|
+
@context.sample_typical(candidates, prob: typical_p)
|
119
|
+
@context.sample_top_p(candidates, prob: top_p)
|
120
|
+
@context.sample_temperature(candidates, temperature: temperature)
|
121
|
+
id = @context.sample_token(candidates)
|
122
|
+
|
123
|
+
last_n_tokens.shift
|
124
|
+
last_n_tokens.push(id)
|
125
|
+
|
105
126
|
last_n_tokens.shift
|
106
127
|
last_n_tokens.push(id)
|
107
128
|
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.
|
6
|
+
VERSION = '0.1.1'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-
|
9
|
+
LLAMA_CPP_VERSION = 'master-6986c78'
|
10
10
|
end
|
data/lib/llama_cpp.rb
CHANGED
@@ -37,7 +37,16 @@ module LLaMACpp
|
|
37
37
|
n_past = 0
|
38
38
|
n_remain = n_predict
|
39
39
|
repeat_last_n = 64
|
40
|
+
repeat_penalty = 1.1
|
41
|
+
frequency = 0.0
|
42
|
+
presence = 0.0
|
43
|
+
top_k = 40
|
44
|
+
top_p = 0.95
|
45
|
+
tfs_z = 1.0
|
46
|
+
typical_p = 1.0
|
47
|
+
temperature = 0.8
|
40
48
|
n_batch = 512
|
49
|
+
n_vocab = context.n_vocab
|
41
50
|
output = []
|
42
51
|
|
43
52
|
while n_remain != 0
|
@@ -55,10 +64,25 @@ module LLaMACpp
|
|
55
64
|
embd.clear
|
56
65
|
|
57
66
|
if embd_input.size <= n_consumed
|
58
|
-
|
59
|
-
|
60
|
-
|
67
|
+
logits = context.logits
|
68
|
+
base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
|
69
|
+
candidates = LLaMACpp::TokenDataArray.new(base_candidates)
|
70
|
+
|
71
|
+
# apply penalties
|
72
|
+
last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
|
73
|
+
context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
|
74
|
+
context.sample_frequency_and_presence_penalties(
|
75
|
+
candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
|
61
76
|
)
|
77
|
+
|
78
|
+
# temperature sampling
|
79
|
+
context.sample_top_k(candidates, k: top_k)
|
80
|
+
context.sample_tail_free(candidates, z: tfs_z)
|
81
|
+
context.sample_typical(candidates, prob: typical_p)
|
82
|
+
context.sample_top_p(candidates, prob: top_p)
|
83
|
+
context.sample_temperature(candidates, temperature: temperature)
|
84
|
+
id = context.sample_token(candidates)
|
85
|
+
|
62
86
|
last_n_tokens.shift
|
63
87
|
last_n_tokens.push(id)
|
64
88
|
|
data/sig/llama_cpp.rbs
CHANGED
@@ -10,8 +10,6 @@ module LLaMACpp
|
|
10
10
|
LLAMA_FTYPE_MOSTLY_Q4_0: Integer
|
11
11
|
LLAMA_FTYPE_MOSTLY_Q4_1: Integer
|
12
12
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
|
13
|
-
LLAMA_FTYPE_MOSTLY_Q4_2: Integer
|
14
|
-
LLAMA_FTYPE_MOSTLY_Q4_3: Integer
|
15
13
|
LLAMA_FTYPE_MOSTLY_Q8_0: Integer
|
16
14
|
LLAMA_FTYPE_MOSTLY_Q5_0: Integer
|
17
15
|
LLAMA_FTYPE_MOSTLY_Q5_1: Integer
|
@@ -21,9 +19,30 @@ module LLaMACpp
|
|
21
19
|
def self?.print_system_info: () -> void
|
22
20
|
def self?.token_bos: () -> Integer
|
23
21
|
def self?.token_eos: () -> Integer
|
22
|
+
def self?.token_nl: () -> Integer
|
24
23
|
def self?.mmap_supported?: () -> bool
|
25
24
|
def self?.mlock_supported?: () -> bool
|
26
25
|
|
26
|
+
class TokenData
|
27
|
+
public
|
28
|
+
|
29
|
+
def initialize: (id: Integer, logit: Float, p: Float) -> void
|
30
|
+
def id: () -> Integer
|
31
|
+
def id=: (Integer) -> Integer
|
32
|
+
def logit: () -> Float
|
33
|
+
def logit=: (Float) -> Float
|
34
|
+
def p: () -> Float
|
35
|
+
def p=: (Float) -> Float
|
36
|
+
end
|
37
|
+
|
38
|
+
class TokenDataArray
|
39
|
+
public
|
40
|
+
|
41
|
+
def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
|
42
|
+
def size: () -> Integer
|
43
|
+
def sorted: () -> bool
|
44
|
+
end
|
45
|
+
|
27
46
|
class Context
|
28
47
|
public
|
29
48
|
|
@@ -40,10 +59,25 @@ module LLaMACpp
|
|
40
59
|
def n_vocab: () -> Integer
|
41
60
|
def print_timings: () -> void
|
42
61
|
def reset_timings: () -> void
|
43
|
-
def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
|
44
62
|
def token_to_str: (Integer) -> String
|
45
63
|
def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
|
46
64
|
def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
|
65
|
+
def kv_cache_token_count: () -> Integer
|
66
|
+
def set_rng_seed: (Integer) -> void
|
67
|
+
def load_session_file: (session_path: String) -> void
|
68
|
+
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
69
|
+
def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
|
70
|
+
def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
|
71
|
+
def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
|
72
|
+
def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
|
73
|
+
def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
74
|
+
def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
|
75
|
+
def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
76
|
+
def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
|
77
|
+
def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
|
78
|
+
def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
|
79
|
+
def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
|
80
|
+
def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
|
47
81
|
end
|
48
82
|
|
49
83
|
class ContextParams
|
@@ -57,8 +91,6 @@ module LLaMACpp
|
|
57
91
|
def logits_all=: (bool) -> bool
|
58
92
|
def n_ctx: () -> Integer
|
59
93
|
def n_ctx=: (Integer) -> Integer
|
60
|
-
def n_parts: () -> Integer
|
61
|
-
def n_parts=: (Integer) -> Integer
|
62
94
|
def seed: () -> Integer
|
63
95
|
def seed=: (Integer) -> Integer
|
64
96
|
def use_mlock: () -> bool
|
@@ -73,10 +105,12 @@ module LLaMACpp
|
|
73
105
|
|
74
106
|
class Client
|
75
107
|
def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
|
76
|
-
?n_ctx: Integer, ?
|
108
|
+
?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
|
77
109
|
?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
|
78
110
|
def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
|
79
|
-
?
|
111
|
+
?frequency: Float, ?presence: Float,
|
112
|
+
?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
|
113
|
+
?repeat_penalty: Float) -> String
|
80
114
|
def embeddings(String) -> Array[Float]
|
81
115
|
end
|
82
116
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -31,9 +31,9 @@ files:
|
|
31
31
|
- ext/llama_cpp/src/ggml-opencl.h
|
32
32
|
- ext/llama_cpp/src/ggml.c
|
33
33
|
- ext/llama_cpp/src/ggml.h
|
34
|
+
- ext/llama_cpp/src/llama-util.h
|
34
35
|
- ext/llama_cpp/src/llama.cpp
|
35
36
|
- ext/llama_cpp/src/llama.h
|
36
|
-
- ext/llama_cpp/src/llama_util.h
|
37
37
|
- lib/llama_cpp.rb
|
38
38
|
- lib/llama_cpp/client.rb
|
39
39
|
- lib/llama_cpp/version.rb
|