llama_cpp 0.0.7 → 0.1.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +736 -36
- data/ext/llama_cpp/src/ggml-cuda.h +8 -33
- data/ext/llama_cpp/src/ggml-opencl.c +202 -20
- data/ext/llama_cpp/src/ggml.c +732 -496
- data/ext/llama_cpp/src/ggml.h +47 -5
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +560 -147
- data/ext/llama_cpp/src/llama.h +71 -24
- data/lib/llama_cpp/client.rb +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +38 -3
- metadata +3 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,9 +19,11 @@
 # define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION
-#define LLAMA_FILE_MAGIC
-#define LLAMA_FILE_MAGIC_UNVERSIONED
+#define LLAMA_FILE_VERSION 1
+#define LLAMA_FILE_MAGIC 'ggjt'
+#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_SESSION_VERSION 1
 
 #ifdef __cplusplus
 extern "C" {
@@ -39,18 +41,22 @@ extern "C" {
 
    typedef struct llama_token_data {
        llama_token id; // token id
-
+       float logit;    // log-odds of the token
        float p;        // probability of the token
-       float plog;     // log probability of the token
-
    } llama_token_data;
 
+   typedef struct llama_token_data_array {
+       llama_token_data * data;
+       size_t size;
+       bool sorted;
+   } llama_token_data_array;
+
    typedef void (*llama_progress_callback)(float progress, void *ctx);
 
    struct llama_context_params {
        int n_ctx;   // text context
        int n_parts; // -1 for default
-       int seed;    // RNG seed,
+       int seed;    // RNG seed, -1 for random
 
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -73,7 +79,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,          // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_3
+       // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,          // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,          // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,          // except 1d tensors
@@ -116,13 +122,14 @@ extern "C" {
            int n_threads);
 
    // Returns the number of tokens in the KV cache
-   LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+   LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
    // Sets the current rng seed.
    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 
-   // Returns the size in bytes of the state (rng, logits, embedding
-
+   // Returns the maximum size in bytes of the state (rng, logits, embedding
+   // and kv_cache) - will often be smaller after compacting tokens
+   LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
 
    // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
@@ -133,6 +140,10 @@ extern "C" {
    // Returns the number of bytes read
    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
+   // Save/load session file
+   LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+   LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
    // n_past is the number of tokens to use from previous eval calls
@@ -156,9 +167,9 @@ extern "C" {
            int n_max_tokens,
            bool add_bos);
 
-   LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-   LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
-   LLAMA_API int llama_n_embd (struct llama_context * ctx);
+   LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+   LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+   LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
@@ -172,21 +183,57 @@ extern "C" {
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
    // Token Id -> String. Uses the vocabulary in the provided context
-   LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+   LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
    // Special tokens
    LLAMA_API llama_token llama_token_bos();
    LLAMA_API llama_token llama_token_eos();
+   LLAMA_API llama_token llama_token_nl();
+
+   // Sampling functions
+
+   /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+   LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+   /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+   LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+   /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+   LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+   /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+   LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+   /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+   LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+   /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+   LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+   /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+   LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+   LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+   /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+   /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+   /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+   /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+   /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+   /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+   LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+   /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+   /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+   /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+   /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+   /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+   LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+   /// @details Selects the token with the highest probability.
+   LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
 
-
-   LLAMA_API llama_token
-       struct llama_context * ctx,
-       const llama_token * last_n_tokens_data,
-       int last_n_tokens_size,
-       int top_k,
-       float top_p,
-       float temp,
-       float repeat_penalty);
+   /// @details Randomly selects a token from the candidates based on their probabilities.
+   LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
    // Performance information
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
data/lib/llama_cpp/client.rb
CHANGED
@@ -2,7 +2,7 @@
 
 module LLaMACpp
   # Client provides a high-level interface to the LLM model.
-  class Client
+  class Client # rubocop:disable Metrics/ClassLength
     # Creates a new client.
     #
     # @param model_path [String] The path to the model file.
@@ -61,14 +61,19 @@ module LLaMACpp
     # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
     # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
     # @param n_batch [Integer] The batch size.
+    # @param frequency [Float] The frequency penalty value.
+    # @param presence [Float] The presence penalty value.
     # @param top_k [Integer] The top-k value.
     # @param top_p [Float] The top-p value.
+    # @param tfs_z [Float] The tail free sampling parameter.
+    # @param typical_p [Float] The typical probability value.
     # @param temperature [Float] The temperature value.
     # @param repeat_penalty [Float] The repeat penalty value.
     # @return [String]
     # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
     def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
-
+                    frequency: 0.0, presence: 0.0,
+                    top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
       embd_input = tokenize_prompt(prompt)
 
       n_ctx = @context.n_ctx
@@ -80,6 +85,7 @@ module LLaMACpp
       n_consumed = 0
       n_past = 0
       n_remain = max_tokens
+      n_vocab = @context.n_vocab
       output = []
 
       while n_remain != 0
@@ -97,11 +103,28 @@ module LLaMACpp
         embd.clear
 
         if embd_input.size <= n_consumed
-
-
-
-
+          logits = @context.logits
+          base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+          candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+          # apply penalties
+          last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+          @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+          @context.sample_frequency_and_presence_penalties(
+            candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
           )
+
+          # temperature sampling
+          @context.sample_top_k(candidates, k: top_k)
+          @context.sample_tail_free(candidates, z: tfs_z)
+          @context.sample_typical(candidates, prob: typical_p)
+          @context.sample_top_p(candidates, prob: top_p)
+          @context.sample_temperature(candidates, temperature: temperature)
+          id = @context.sample_token(candidates)
+
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
           last_n_tokens.shift
           last_n_tokens.push(id)
 
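For orientation, the expanded Client#completions signature can be exercised as below. This is an illustrative sketch, not shipped example code: the model path is a placeholder, and the keyword-style constructor arguments are assumed from the documented parameters and the sig file.

client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model.bin', seed: 12)
text = client.completions(
  'Hello, my name is',
  max_tokens: 64,
  frequency: 0.0, presence: 0.0,
  top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0,
  temperature: 0.8, repeat_penalty: 1.1
)
puts text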
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.7'
+  VERSION = '0.1.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-173d0e6'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -37,7 +37,16 @@ module LLaMACpp
     n_past = 0
     n_remain = n_predict
     repeat_last_n = 64
+    repeat_penalty = 1.1
+    frequency = 0.0
+    presence = 0.0
+    top_k = 40
+    top_p = 0.95
+    tfs_z = 1.0
+    typical_p = 1.0
+    temperature = 0.8
     n_batch = 512
+    n_vocab = context.n_vocab
     output = []
 
     while n_remain != 0
@@ -55,10 +64,25 @@ module LLaMACpp
       embd.clear
 
       if embd_input.size <= n_consumed
-
-
-
+        logits = context.logits
+        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+        # apply penalties
+        last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+        context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+        context.sample_frequency_and_presence_penalties(
+          candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
        )
+
+        # temperature sampling
+        context.sample_top_k(candidates, k: top_k)
+        context.sample_tail_free(candidates, z: tfs_z)
+        context.sample_typical(candidates, prob: typical_p)
+        context.sample_top_p(candidates, prob: top_p)
+        context.sample_temperature(candidates, temperature: temperature)
+        id = context.sample_token(candidates)
+
         last_n_tokens.shift
         last_n_tokens.push(id)
 
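One detail worth noting in the new sampling block is the penalty window: last_n_repeat clamps the slice of recent tokens that the repetition and frequency/presence penalties see. A small worked example (values chosen purely for illustration):

last_n_tokens = [5, 9, 9, 2, 7, 7]
repeat_last_n = 4
n_ctx         = 512

last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min  # => 4
last_n_tokens[-last_n_repeat..]                                 # => [9, 2, 7, 7]

Only those most recent tokens are penalized, so raising repeat_last_n widens the window, up to the context size.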
data/sig/llama_cpp.rbs
CHANGED
@@ -11,7 +11,6 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q4_1: Integer
   LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_2: Integer
-  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
@@ -21,9 +20,30 @@ module LLaMACpp
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
 
+  class TokenData
+    public
+
+    def initialize: (id: Integer, logit: Float, p: Float) -> void
+    def id: () -> Integer
+    def id=: (Integer) -> Integer
+    def logit: () -> Float
+    def logit=: (Float) -> Float
+    def p: () -> Float
+    def p=: (Float) -> Float
+  end
+
+  class TokenDataArray
+    public
+
+    def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
+    def size: () -> Integer
+    def sorted: () -> bool
+  end
+
   class Context
     public
 
@@ -40,10 +60,23 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def print_timings: () -> void
     def reset_timings: () -> void
-    def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def kv_cache_token_count: () -> Integer
+    def set_rng_seed: (Integer) -> void
+    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
+    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
+    def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
+    def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
+    def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
+    def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
+    def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
+    def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
   end
 
   class ContextParams
@@ -76,7 +109,9 @@ module LLaMACpp
                        ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
                        ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
     def completions: (String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
-                      ?
+                      ?frequency: Float, ?presence: Float,
+                      ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
+                      ?repeat_penalty: Float) -> String
     def embeddings: (String) -> Array[Float]
   end
 end
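Read together, the TokenData/TokenDataArray signatures and the new Context methods also allow a plain greedy decode without the full pipeline used in client.rb. A minimal sketch based only on the signatures above, again assuming `context` is an evaluated LLaMACpp::Context:

logits = context.logits
data = Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
candidates = LLaMACpp::TokenDataArray.new(data, sorted: false)

context.sample_softmax(candidates)            # sorts by logit and fills p
id = context.sample_token_greedy(candidates)  # highest-probability token
puts context.token_to_str(id)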
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.1.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-05-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -31,9 +31,9 @@ files:
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
+- ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
-- ext/llama_cpp/src/llama_util.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/client.rb
 - lib/llama_cpp/version.rb