llama_cpp 0.5.3 → 0.6.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -37,6 +37,8 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
+#define LLAMA_MAX_RNG_STATE (64*1024)
+
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
@@ -60,13 +62,9 @@ extern "C" {
     struct llama_model;
     struct llama_context;
 
-    typedef int llama_token;
-
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
+    typedef int32_t llama_pos;
+    typedef int32_t llama_token;
+    typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@@ -86,24 +84,24 @@ extern "C" {
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
-        LLAMA_FTYPE_MOSTLY_F16 = 1,
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,
-        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,
-        LLAMA_FTYPE_MOSTLY_Q2_K = 10
-        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11
-        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12
-        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13
-        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14
-        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
-        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
-        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
-        LLAMA_FTYPE_MOSTLY_Q6_K = 18
+        LLAMA_FTYPE_MOSTLY_F16 = 1,           // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,          // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,       // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,       // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,          // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,          // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,          // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10,         // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,       // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,       // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,       // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,       // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,       // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,       // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,       // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,         // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -122,41 +120,68 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
-
+    // Input data for llama_decode
+    // A llama_batch object can contain input about one or many sequences
+    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+    //
+    // - token  : the token ids of the input (used when embd is NULL)
+    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+    // - pos    : the positions of the respective token in the sequence
+    // - seq_id : the sequence to which the respective token belongs
+    // - logits : if zero, the logits for the respective token will not be output
+    //
+    typedef struct llama_batch {
+        int32_t n_tokens;
+
+        llama_token  * token;
+        float        * embd;
+        llama_pos    * pos;
+        llama_seq_id * seq_id;
+        int8_t       * logits;
+
+        // NOTE: helpers for smooth API transition - can be deprecated in the future
+        // for future-proof code, use the above fields instead and ignore everything below
+        //
+        // pos[i] = all_pos_0 + i*all_pos_1
+        //
+        llama_pos    all_pos_0;  // used if pos == NULL
+        llama_pos    all_pos_1;  // used if pos == NULL
+        llama_seq_id all_seq_id; // used if seq_id == NULL
+    } llama_batch;
+
+    struct llama_model_params {
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
-
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
         void * progress_callback_user_data;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
     };
 
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+    struct llama_context_params {
+        uint32_t seed;            // RNG seed, -1 for random
+        uint32_t n_ctx;           // text context
+        uint32_t n_batch;         // prompt processing batch size
+        uint32_t n_threads;       // number of threads to use for generation
+        uint32_t n_threads_batch; // number of threads to use for batch processing
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool f16_kv;     // use fp16 for KV cache
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool embedding;  // embedding mode only
+    };
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
@@ -215,6 +240,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // Helpers for getting default parameters
+    LLAMA_API struct llama_model_params          llama_model_default_params(void);
     LLAMA_API struct llama_context_params        llama_context_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
@@ -228,7 +255,7 @@ extern "C" {
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            struct llama_model_params params);
 
     LLAMA_API void llama_free_model(struct llama_model * model);
 
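For orientation, the model/context split introduced above can be used roughly as in the following sketch. This is illustrative only and not code shipped with the gem; llama_new_context_with_model() is part of llama.h but unchanged in this release, so it does not appear in the diff.

#include <stddef.h>
#include "llama.h"

// Sketch: load a model with llama_model_params, then create a context
// with the separate llama_context_params (assumes a CPU-only build).
static struct llama_context * open_context(const char * path_model) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // keep everything in system RAM for this sketch

    struct llama_model * model = llama_load_model_from_file(path_model, mparams);
    if (model == NULL) {
        return NULL;
    }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048; // text context
    cparams.n_threads       = 4;    // generation threads
    cparams.n_threads_batch = 4;    // prompt/batch processing threads

    // Declared elsewhere in llama.h (unchanged by this release).
    return llama_new_context_with_model(model, cparams);
}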
@@ -245,25 +272,28 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported (void);
     LLAMA_API bool llama_mlock_supported(void);
 
-    LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
     LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
-    LLAMA_API int llama_model_n_vocab    (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx      (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);
 
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Get a llama model tensor
+    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
@@ -279,21 +309,65 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads),
-                    "use llama_model_apply_lora_from_file instead");
+            "use llama_model_apply_lora_from_file instead");
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-            const char * path_lora,
-            const char * path_base_model,
-            int n_threads);
+            const char * path_lora,
+            float scale,
+            const char * path_base_model,
+            int n_threads);
+
+    //
+    // KV cache
+    //
 
     // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+    // Remove all tokens data of cells in [c0, c1)
+    LLAMA_API void llama_kv_cache_tokens_rm(
+            struct llama_context * ctx,
+            int32_t c0,
+            int32_t c1);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    LLAMA_API void llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    LLAMA_API void llama_kv_cache_seq_shift(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
+
+    //
+    // State / sessions
+    //
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
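To make the intent of the per-sequence calls concrete, here is a hedged sketch (not gem code) that shares one decoded prompt across several sequences and then trims and shifts one of them. It uses only the declarations added above and assumes the prompt was already decoded into sequence 0.

#include "llama.h"

// Sketch: reuse one decoded prompt for several parallel sequences.
// Assumes the prompt was decoded into sequence 0 over positions [0, n_prompt).
void fork_and_trim(struct llama_context * ctx, llama_pos n_prompt) {
    // Share the prompt cells with sequences 1 and 2 (no extra KV memory is allocated).
    llama_kv_cache_seq_cp(ctx, 0, 1, 0, n_prompt);
    llama_kv_cache_seq_cp(ctx, 0, 2, 0, n_prompt);

    // Drop the first 16 positions of sequence 2 ...
    llama_kv_cache_seq_rm(ctx, 2, 0, 16);

    // ... and shift the remaining cells back by 16 so the positions stay
    // contiguous (per the header, RoPEd KV data is updated accordingly).
    llama_kv_cache_seq_shift(ctx, 2, 16, n_prompt, -16);
}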
@@ -302,48 +376,102 @@ extern "C" {
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+    LLAMA_API size_t llama_copy_state_data(
+            struct llama_context * ctx,
+            uint8_t * dst);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(
+            struct llama_context * ctx,
+            uint8_t * src);
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
+
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+
+    //
+    // Decoding
+    //
 
-    // Run the llama inference to obtain the logits and probabilities for the next token.
+    // Run the llama inference to obtain the logits and probabilities for the next token(s).
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
     // Returns 0 on success
-    LLAMA_API int llama_eval(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval(
             struct llama_context * ctx,
-            const llama_token * tokens,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            llama_token * tokens,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
 
     // Same as llama_eval, but use float matrix input directly.
-    LLAMA_API int llama_eval_embd(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval_embd(
             struct llama_context * ctx,
-            float * embd,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            float * embd,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
 
-    // Export a static computation graph for context of 511 and batch size of 1
-    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
-    //       parameters here to keep things simple
-    // IMPORTANT: do not use for anything else other than debugging and testing!
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    // Return batch for single sequence of tokens starting at pos_0
+    //
+    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    //
+    LLAMA_API struct llama_batch llama_batch_get_one(
+            llama_token * tokens,
+            int32_t n_tokens,
+            llama_pos pos_0,
+            llama_seq_id seq_id);
+
+    // Allocates a batch of tokens on the heap
+    // The batch has to be freed with llama_batch_free()
+    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+    // The rest of the llama_batch members are allocated with size n_tokens
+    // All members are left uninitialized
+    LLAMA_API struct llama_batch llama_batch_init(
+            int32_t n_tokens,
+            int32_t embd);
+
+    // Frees a batch of tokens allocated with llama_batch_init()
+    LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+    // Positive return values does not mean a fatal error, but rather a warning.
+    //   0 - success
+    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // < 0 - error
+    LLAMA_API int llama_decode(
+            struct llama_context * ctx,
+            struct llama_batch batch);
+
+    // Set the number of threads used for decoding
+    // n_threads is the number of threads used for generation (single token)
+    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
+    // Logits for which llama_batch.logits[i] == 0 are undefined
+    // Rows: n_tokens provided with llama_batch
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
+    // Logits for the ith token. Equivalent to:
+    // llama_get_logits(ctx) + i*n_vocab
+    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
     // Get the embeddings for the input
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
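A hedged sketch of how the pieces above fit together, modelled loosely on the upstream llama.cpp "simple" example rather than on code in this gem: the prompt is submitted as one llama_batch, logits are requested only for its last token, and each sampled token is fed back as a single-token batch. Greedy argmax is used here purely for brevity; see the sampling section below.

#include "llama.h"

// Sketch of a greedy generation loop with the new batch API (not gem code).
// `prompt` holds an already tokenized prompt with n_prompt <= 512 tokens.
void generate(struct llama_context * ctx,
              const llama_token * prompt, int32_t n_prompt, int n_predict) {
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    struct llama_batch batch = llama_batch_init(512, 0);

    // Submit the whole prompt as sequence 0 and request logits
    // only for its last token.
    batch.n_tokens = n_prompt;
    for (int32_t i = 0; i < n_prompt; ++i) {
        batch.token[i]  = prompt[i];
        batch.pos[i]    = i;
        batch.seq_id[i] = 0;
        batch.logits[i] = 0;
    }
    batch.logits[n_prompt - 1] = 1;

    llama_pos n_cur = n_prompt;
    for (int i = 0; i < n_predict; ++i) {
        if (llama_decode(ctx, batch) != 0) {
            break; // 1 = no KV slot found, < 0 = error (see header comment)
        }

        // Greedy argmax over the logits of the last token in the batch;
        // a real program would print/store the token and stop on EOS.
        const float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        llama_token best = 0;
        for (llama_token t = 1; t < n_vocab; ++t) {
            if (logits[t] > logits[best]) best = t;
        }

        // Feed the sampled token back as a single-token batch.
        batch.n_tokens  = 1;
        batch.token[0]  = best;
        batch.pos[0]    = n_cur++;
        batch.seq_id[0] = 0;
        batch.logits[0] = 1;
    }

    llama_batch_free(batch);
}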
@@ -372,14 +500,6 @@ extern "C" {
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
     LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-            const char * text,
-            int text_len,
-            llama_token * tokens,
-            int n_max_tokens,
-            bool add_bos);
-
-    LLAMA_API int llama_tokenize_with_model(
             const struct llama_model * model,
             const char * text,
             int text_len,
@@ -392,12 +512,6 @@ extern "C" {
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
     LLAMA_API int llama_token_to_piece(
-            const struct llama_context * ctx,
-            llama_token token,
-            char * buf,
-            int length);
-
-    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
             llama_token token,
             char * buf,
@@ -420,11 +534,25 @@ extern "C" {
     // Sampling functions
     //
 
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+    LLAMA_API void llama_sample_repetition_penalty(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float penalty);
 
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float alpha_frequency,
+            float alpha_presence);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -437,23 +565,54 @@ extern "C" {
             float scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API void llama_sample_softmax(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+    LLAMA_API void llama_sample_top_k(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            int k,
+            size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+    LLAMA_API void llama_sample_top_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+    LLAMA_API void llama_sample_tail_free(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float z,
+            size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+    LLAMA_API void llama_sample_typical(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
+    LLAMA_API void llama_sample_temp(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp);
+
+    LLAMA_API DEPRECATED(void llama_sample_temperature(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp),
+            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+    LLAMA_API void llama_sample_grammar(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const struct llama_grammar * grammar);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -461,23 +620,41 @@ extern "C" {
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            int m,
+            float * mu);
 
     /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            float * mu);
 
     /// @details Selects the token with the highest probability.
-    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token_greedy(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+    LLAMA_API void llama_grammar_accept_token(
+            struct llama_context * ctx,
+            struct llama_grammar * grammar,
+            llama_token token);
 
     //
     // Beam search
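Taken together with llama_get_logits_ith() above, a conventional sampling chain looks roughly like the following sketch. The llama_token_data and llama_token_data_array types are part of llama.h but unchanged in this release, so they do not appear in the diff; the specific top-k/top-p/temperature values are arbitrary, and llama_sample_temp() supersedes the now-deprecated llama_sample_temperature().

#include <stdlib.h>
#include "llama.h"

// Sketch: build a llama_token_data_array from the last decoded token's
// logits and run a typical top-k / top-p / temperature sampling chain.
llama_token sample_next(struct llama_context * ctx, int32_t logits_idx) {
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const float * logits = llama_get_logits_ith(ctx, logits_idx);

    llama_token_data * data = malloc(n_vocab * sizeof(llama_token_data));
    for (llama_token t = 0; t < n_vocab; ++t) {
        data[t].id    = t;
        data[t].logit = logits[t];
        data[t].p     = 0.0f;
    }
    llama_token_data_array candidates = { data, (size_t) n_vocab, false };

    llama_sample_top_k(ctx, &candidates, 40, 1);
    llama_sample_top_p(ctx, &candidates, 0.95f, 1);
    llama_sample_temp (ctx, &candidates, 0.8f); // replaces llama_sample_temperature

    const llama_token id = llama_sample_token(ctx, &candidates);
    free(data);
    return id;
}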
@@ -485,9 +662,10 @@ extern "C" {
 
     struct llama_beam_view {
         const llama_token * tokens;
+
         size_t n_tokens;
-        float p;   // Cumulative beam probability (renormalized relative to all beams)
-        bool  eob; // Callback should set this to true when a beam is at end-of-beam.
+        float  p;        // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
     };
 
     // Passed to beam_search_callback function.
@@ -496,9 +674,10 @@ extern "C" {
     // These pointers are valid only during the synchronous callback, so should not be saved.
     struct llama_beams_state {
         struct llama_beam_view * beam_views;
+
        size_t n_beams;               // Number of elements in beam_views[].
        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-       bool last_call;               // True iff this is the last callback invocation.
+       bool   last_call;             // True iff this is the last callback invocation.
     };
 
     // Type of pointer to the beam_search_callback function.
@@ -513,11 +692,17 @@ extern "C" {
     /// @param n_beams Number of beams to use.
     /// @param n_past Number of tokens already evaluated.
     /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-    /// @param n_threads Number of threads as passed to llama_eval().
-    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+    LLAMA_API void llama_beam_search(
+            struct llama_context * ctx,
+            llama_beam_search_callback_fn_t callback,
+            void * callback_data,
+            size_t n_beams,
+            int n_past,
+            int n_predict);
 
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
@@ -526,7 +711,7 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.5.3'
+  VERSION = '0.6.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1292'
 end