llama_cpp 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -37,6 +37,8 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
+#define LLAMA_MAX_RNG_STATE (64*1024)
+
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
@@ -60,13 +62,9 @@ extern "C" {
     struct llama_model;
     struct llama_context;
 
-    typedef int llama_token;
-
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
+    typedef int32_t llama_pos;
+    typedef int32_t llama_token;
+    typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@@ -86,24 +84,24 @@
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
-        LLAMA_FTYPE_MOSTLY_F16 = 1,
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,
-        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,
-        LLAMA_FTYPE_MOSTLY_Q2_K = 10
-        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11
-        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12
-        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13
-        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14
-        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
-        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
-        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
-        LLAMA_FTYPE_MOSTLY_Q6_K = 18
+        LLAMA_FTYPE_MOSTLY_F16 = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -122,41 +120,68 @@
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
-
+    // Input data for llama_decode
+    // A llama_batch object can contain input about one or many sequences
+    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+    //
+    // - token  : the token ids of the input (used when embd is NULL)
+    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+    // - pos    : the positions of the respective token in the sequence
+    // - seq_id : the sequence to which the respective token belongs
+    // - logits : if zero, the logits for the respective token will not be output
+    //
+    typedef struct llama_batch {
+        int32_t n_tokens;
+
+        llama_token  * token;
+        float        * embd;
+        llama_pos    * pos;
+        llama_seq_id * seq_id;
+        int8_t       * logits;
+
+        // NOTE: helpers for smooth API transition - can be deprecated in the future
+        // for future-proof code, use the above fields instead and ignore everything below
+        //
+        // pos[i] = all_pos_0 + i*all_pos_1
+        //
+        llama_pos    all_pos_0;  // used if pos == NULL
+        llama_pos    all_pos_1;  // used if pos == NULL
+        llama_seq_id all_seq_id; // used if seq_id == NULL
+    } llama_batch;
+
+    struct llama_model_params {
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
-
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
         void * progress_callback_user_data;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
     };
 
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+    struct llama_context_params {
+        uint32_t seed;            // RNG seed, -1 for random
+        uint32_t n_ctx;           // text context
+        uint32_t n_batch;         // prompt processing batch size
+        uint32_t n_threads;       // number of threads to use for generation
+        uint32_t n_threads_batch; // number of threads to use for batch processing
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool f16_kv;     // use fp16 for KV cache
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool embedding;  // embedding mode only
+    };
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
@@ -215,6 +240,8 @@
         int32_t n_eval;
     };
 
+    // Helpers for getting default parameters
+    LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
@@ -228,7 +255,7 @@
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            struct llama_model_params params);
 
     LLAMA_API void llama_free_model(struct llama_model * model);
 
@@ -245,25 +272,28 @@
     LLAMA_API bool llama_mmap_supported (void);
     LLAMA_API bool llama_mlock_supported(void);
 
-    LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
     LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
-    LLAMA_API int llama_model_n_vocab    (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx      (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);
 
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Get a llama model tensor
+    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
@@ -279,21 +309,65 @@
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads),
-            "
+            "use llama_model_apply_lora_from_file instead");
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-            const char * path_lora,
-            const char * path_base_model,
-            int n_threads);
+            const char * path_lora,
+            float scale,
+            const char * path_base_model,
+            int n_threads);
+
+    //
+    // KV cache
+    //
 
     // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+    // Remove all tokens data of cells in [c0, c1)
+    LLAMA_API void llama_kv_cache_tokens_rm(
+            struct llama_context * ctx,
+            int32_t c0,
+            int32_t c1);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    LLAMA_API void llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    LLAMA_API void llama_kv_cache_seq_shift(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
+
+    //
+    // State / sessions
+    //
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
@@ -302,48 +376,102 @@
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+    LLAMA_API size_t llama_copy_state_data(
+            struct llama_context * ctx,
+            uint8_t * dst);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(
+            struct llama_context * ctx,
+            uint8_t * src);
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
+
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+
+    //
+    // Decoding
+    //
 
-    // Run the llama inference to obtain the logits and probabilities for the next token.
+    // Run the llama inference to obtain the logits and probabilities for the next token(s).
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
     // Returns 0 on success
-    LLAMA_API int llama_eval(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval(
             struct llama_context * ctx,
-            const llama_token * tokens,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            llama_token * tokens,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
 
     // Same as llama_eval, but use float matrix input directly.
-    LLAMA_API int llama_eval_embd(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval_embd(
             struct llama_context * ctx,
-            const float * embd,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            float * embd,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
 
-    // Export a static computation graph for context of 511 and batch size of 1
-    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
-    //       parameters here to keep things simple
-    // IMPORTANT: do not use for anything else other than debugging and testing!
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    // Return batch for single sequence of tokens starting at pos_0
+    //
+    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    //
+    LLAMA_API struct llama_batch llama_batch_get_one(
+            llama_token * tokens,
+            int32_t n_tokens,
+            llama_pos pos_0,
+            llama_seq_id seq_id);
+
+    // Allocates a batch of tokens on the heap
+    // The batch has to be freed with llama_batch_free()
+    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+    // The rest of the llama_batch members are allocated with size n_tokens
+    // All members are left uninitialized
+    LLAMA_API struct llama_batch llama_batch_init(
+            int32_t n_tokens,
+            int32_t embd);
+
+    // Frees a batch of tokens allocated with llama_batch_init()
+    LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+    // Positive return values does not mean a fatal error, but rather a warning.
+    //   0 - success
+    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // < 0 - error
+    LLAMA_API int llama_decode(
+            struct llama_context * ctx,
+            struct llama_batch batch);
+
+    // Set the number of threads used for decoding
+    // n_threads is the number of threads used for generation (single token)
+    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
+    // Logits for which llama_batch.logits[i] == 0 are undefined
+    // Rows: n_tokens provided with llama_batch
    // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
+    // Logits for the ith token. Equivalent to:
+    // llama_get_logits(ctx) + i*n_vocab
+    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
     // Get the embeddings for the input
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@@ -372,14 +500,6 @@
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
     LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-            const char * text,
-            int text_len,
-            llama_token * tokens,
-            int n_max_tokens,
-            bool add_bos);
-
-    LLAMA_API int llama_tokenize_with_model(
             const struct llama_model * model,
             const char * text,
             int text_len,
@@ -392,12 +512,6 @@
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
     LLAMA_API int llama_token_to_piece(
-            const struct llama_context * ctx,
-            llama_token token,
-            char * buf,
-            int length);
-
-    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
             llama_token token,
             char * buf,
@@ -420,11 +534,25 @@
     // Sampling functions
     //
 
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+    LLAMA_API void llama_sample_repetition_penalty(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float penalty);
 
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float alpha_frequency,
+            float alpha_presence);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -437,23 +565,54 @@
             float scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API void llama_sample_softmax(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+    LLAMA_API void llama_sample_top_k(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            int k,
+            size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+    LLAMA_API void llama_sample_top_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+    LLAMA_API void llama_sample_tail_free(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float z,
+            size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+    LLAMA_API void llama_sample_typical(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
+    LLAMA_API void llama_sample_temp(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp);
+
+    LLAMA_API DEPRECATED(void llama_sample_temperature(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp),
+            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+    LLAMA_API void llama_sample_grammar(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const struct llama_grammar * grammar);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -461,23 +620,41 @@
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            int m,
+            float * mu);
 
     /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            float * mu);
 
     /// @details Selects the token with the highest probability.
-    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token_greedy(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+    LLAMA_API void llama_grammar_accept_token(
+            struct llama_context * ctx,
+            struct llama_grammar * grammar,
+            llama_token token);
 
     //
     // Beam search
@@ -485,9 +662,10 @@
 
     struct llama_beam_view {
         const llama_token * tokens;
+
         size_t n_tokens;
-        float
-        bool
+        float p;  // Cumulative beam probability (renormalized relative to all beams)
+        bool eob; // Callback should set this to true when a beam is at end-of-beam.
     };
 
     // Passed to beam_search_callback function.
@@ -496,9 +674,10 @@
     // These pointers are valid only during the synchronous callback, so should not be saved.
     struct llama_beams_state {
         struct llama_beam_view * beam_views;
+
         size_t n_beams;              // Number of elements in beam_views[].
         size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
-        bool
+        bool last_call;              // True iff this is the last callback invocation.
     };
 
     // Type of pointer to the beam_search_callback function.
@@ -513,11 +692,17 @@
     /// @param n_beams Number of beams to use.
     /// @param n_past Number of tokens already evaluated.
     /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-    /// @param n_threads Number of threads as passed to llama_eval().
-    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+    LLAMA_API void llama_beam_search(
+            struct llama_context * ctx,
+            llama_beam_search_callback_fn_t callback,
+            void * callback_data,
+            size_t n_beams,
+            int n_past,
+            int n_predict);
 
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
@@ -526,7 +711,7 @@
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 
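
For orientation, the following is a minimal, hypothetical C sketch of how the batch-oriented decoding API introduced above is intended to fit together: build a llama_batch with llama_batch_init, submit it with llama_decode, read the logits of the requested position with llama_get_logits_ith, and pick a token with llama_sample_token_greedy. It is an editorial illustration based on the declarations shown in this diff, not code shipped with the gem; it also assumes llama_new_context_with_model, llama_free, llama_free_model and the llama_token_data / llama_token_data_array types from the unchanged parts of llama.h, and omits most error handling.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    // Default-parameter helpers added in this release.
    struct llama_model_params   mparams = llama_model_default_params();
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 512;
    cparams.n_threads = 4;

    struct llama_model   * model = llama_load_model_from_file(argv[1], mparams);
    struct llama_context * ctx   = llama_new_context_with_model(model, cparams);

    // Tokenize with the model-based API (the llama_context-based variants were removed).
    const char * prompt = "Hello";
    llama_token tokens[64];
    const int n_tokens = llama_tokenize(model, prompt, (int) strlen(prompt), tokens, 64, true);

    // Build a batch by hand: one sequence (seq_id 0), positions 0..n_tokens-1,
    // logits requested only for the last prompt token.
    struct llama_batch batch = llama_batch_init(n_tokens, 0);
    batch.n_tokens = n_tokens;
    for (int i = 0; i < n_tokens; ++i) {
        batch.token [i] = tokens[i];
        batch.pos   [i] = i;
        batch.seq_id[i] = 0;
        batch.logits[i] = (i == n_tokens - 1);
    }

    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "llama_decode failed\n");
        return 1;
    }

    // Greedily pick the next token from the logits of the last prompt position.
    const int n_vocab = llama_n_vocab(model);
    float * logits = llama_get_logits_ith(ctx, n_tokens - 1);

    llama_token_data * cand = malloc((size_t) n_vocab * sizeof(*cand));
    for (int i = 0; i < n_vocab; ++i) {
        cand[i] = (llama_token_data){ i, logits[i], 0.0f };
    }
    llama_token_data_array arr = { cand, (size_t) n_vocab, false };
    llama_token next = llama_sample_token_greedy(ctx, &arr);

    char piece[64];
    const int n = llama_token_to_piece(model, next, piece, (int) sizeof(piece));
    printf("%.*s\n", n, piece);

    free(cand);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
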
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.5.3'
+  VERSION = '0.6.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1292'
 end