llama_cpp 0.5.3 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -37,10 +37,12 @@
|
|
37
37
|
|
38
38
|
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
39
39
|
|
40
|
+
#define LLAMA_MAX_RNG_STATE (64*1024)
|
41
|
+
|
40
42
|
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
41
43
|
|
42
44
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
43
|
-
#define LLAMA_SESSION_VERSION
|
45
|
+
#define LLAMA_SESSION_VERSION 2
|
44
46
|
|
45
47
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
46
48
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
@@ -60,13 +62,9 @@ extern "C" {
|
|
60
62
|
struct llama_model;
|
61
63
|
struct llama_context;
|
62
64
|
|
63
|
-
typedef
|
64
|
-
|
65
|
-
|
66
|
-
LLAMA_LOG_LEVEL_ERROR = 2,
|
67
|
-
LLAMA_LOG_LEVEL_WARN = 3,
|
68
|
-
LLAMA_LOG_LEVEL_INFO = 4
|
69
|
-
};
|
65
|
+
typedef int32_t llama_pos;
|
66
|
+
typedef int32_t llama_token;
|
67
|
+
typedef int32_t llama_seq_id;
|
70
68
|
|
71
69
|
enum llama_vocab_type {
|
72
70
|
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
@@ -86,24 +84,24 @@ extern "C" {
|
|
86
84
|
// model file types
|
87
85
|
enum llama_ftype {
|
88
86
|
LLAMA_FTYPE_ALL_F32 = 0,
|
89
|
-
LLAMA_FTYPE_MOSTLY_F16 = 1,
|
90
|
-
LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
|
91
|
-
LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
|
92
|
-
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,
|
93
|
-
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5,
|
94
|
-
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6,
|
95
|
-
LLAMA_FTYPE_MOSTLY_Q8_0 = 7,
|
96
|
-
LLAMA_FTYPE_MOSTLY_Q5_0 = 8,
|
97
|
-
LLAMA_FTYPE_MOSTLY_Q5_1 = 9,
|
98
|
-
LLAMA_FTYPE_MOSTLY_Q2_K = 10
|
99
|
-
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11
|
100
|
-
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12
|
101
|
-
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13
|
102
|
-
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14
|
103
|
-
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
|
104
|
-
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
|
105
|
-
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
|
106
|
-
LLAMA_FTYPE_MOSTLY_Q6_K = 18
|
87
|
+
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
88
|
+
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
89
|
+
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
90
|
+
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
91
|
+
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
92
|
+
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
93
|
+
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
94
|
+
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
95
|
+
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
96
|
+
LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
97
|
+
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
|
98
|
+
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
|
99
|
+
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
|
100
|
+
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
|
101
|
+
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
|
102
|
+
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
|
103
|
+
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
|
104
|
+
LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
|
107
105
|
|
108
106
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
109
107
|
};
|
@@ -122,41 +120,68 @@ extern "C" {
|
|
122
120
|
|
123
121
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
124
122
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
123
|
+
// Input data for llama_decode
|
124
|
+
// A llama_batch object can contain input about one or many sequences
|
125
|
+
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
126
|
+
//
|
127
|
+
// - token : the token ids of the input (used when embd is NULL)
|
128
|
+
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
129
|
+
// - pos : the positions of the respective token in the sequence
|
130
|
+
// - seq_id : the sequence to which the respective token belongs
|
131
|
+
// - logits : if zero, the logits for the respective token will not be output
|
132
|
+
//
|
133
|
+
typedef struct llama_batch {
|
134
|
+
int32_t n_tokens;
|
135
|
+
|
136
|
+
llama_token * token;
|
137
|
+
float * embd;
|
138
|
+
llama_pos * pos;
|
139
|
+
llama_seq_id * seq_id;
|
140
|
+
int8_t * logits;
|
141
|
+
|
142
|
+
// NOTE: helpers for smooth API transition - can be deprecated in the future
|
143
|
+
// for future-proof code, use the above fields instead and ignore everything below
|
144
|
+
//
|
145
|
+
// pos[i] = all_pos_0 + i*all_pos_1
|
146
|
+
//
|
147
|
+
llama_pos all_pos_0; // used if pos == NULL
|
148
|
+
llama_pos all_pos_1; // used if pos == NULL
|
149
|
+
llama_seq_id all_seq_id; // used if seq_id == NULL
|
150
|
+
} llama_batch;
|
151
|
+
|
152
|
+
struct llama_model_params {
|
153
|
+
int32_t n_gpu_layers; // number of layers to store in VRAM
|
154
|
+
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
132
155
|
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
133
156
|
|
134
|
-
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
135
|
-
float rope_freq_base; // RoPE base frequency
|
136
|
-
float rope_freq_scale; // RoPE frequency scaling factor
|
137
|
-
|
138
157
|
// called with a progress value between 0 and 1, pass NULL to disable
|
139
158
|
llama_progress_callback progress_callback;
|
140
159
|
// context pointer passed to the progress callback
|
141
160
|
void * progress_callback_user_data;
|
142
161
|
|
143
162
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
144
|
-
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
145
|
-
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
|
146
|
-
bool f16_kv; // use fp16 for KV cache
|
147
|
-
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
148
163
|
bool vocab_only; // only load the vocabulary, no weights
|
149
164
|
bool use_mmap; // use mmap if possible
|
150
165
|
bool use_mlock; // force system to keep model in RAM
|
151
|
-
bool embedding; // embedding mode only
|
152
166
|
};
|
153
167
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
168
|
+
struct llama_context_params {
|
169
|
+
uint32_t seed; // RNG seed, -1 for random
|
170
|
+
uint32_t n_ctx; // text context, 0 = from model
|
171
|
+
uint32_t n_batch; // prompt processing maximum batch size
|
172
|
+
uint32_t n_threads; // number of threads to use for generation
|
173
|
+
uint32_t n_threads_batch; // number of threads to use for batch processing
|
174
|
+
|
175
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
176
|
+
float rope_freq_base; // RoPE base frequency, 0 = from model
|
177
|
+
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
178
|
+
|
179
|
+
// Keep the booleans together to avoid misalignment during copy-by-value.
|
180
|
+
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
|
181
|
+
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
|
182
|
+
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
183
|
+
bool embedding; // embedding mode only
|
184
|
+
};
|
160
185
|
|
161
186
|
// model quantization parameters
|
162
187
|
typedef struct llama_model_quantize_params {
|
@@ -215,6 +240,8 @@ extern "C" {
|
|
215
240
|
int32_t n_eval;
|
216
241
|
};
|
217
242
|
|
243
|
+
// Helpers for getting default parameters
|
244
|
+
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
218
245
|
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
219
246
|
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
|
220
247
|
|
@@ -228,7 +255,7 @@ extern "C" {
|
|
228
255
|
|
229
256
|
LLAMA_API struct llama_model * llama_load_model_from_file(
|
230
257
|
const char * path_model,
|
231
|
-
struct
|
258
|
+
struct llama_model_params params);
|
232
259
|
|
233
260
|
LLAMA_API void llama_free_model(struct llama_model * model);
|
234
261
|
|
@@ -245,25 +272,31 @@ extern "C" {
|
|
245
272
|
LLAMA_API bool llama_mmap_supported (void);
|
246
273
|
LLAMA_API bool llama_mlock_supported(void);
|
247
274
|
|
248
|
-
LLAMA_API
|
275
|
+
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
276
|
+
|
249
277
|
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
250
|
-
LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
|
251
|
-
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
252
278
|
|
253
|
-
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct
|
279
|
+
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
280
|
+
|
281
|
+
LLAMA_API int llama_n_vocab (const struct llama_model * model);
|
282
|
+
LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
|
283
|
+
LLAMA_API int llama_n_embd (const struct llama_model * model);
|
254
284
|
|
255
|
-
|
256
|
-
LLAMA_API
|
257
|
-
LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
|
258
|
-
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
285
|
+
// Get the model's RoPE frequency scaling factor
|
286
|
+
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
259
287
|
|
260
288
|
// Get a string describing the model type
|
261
289
|
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
290
|
+
|
262
291
|
// Returns the total size of all the tensors in the model in bytes
|
263
292
|
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
293
|
+
|
264
294
|
// Returns the total number of parameters in the model
|
265
295
|
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
266
296
|
|
297
|
+
// Get a llama model tensor
|
298
|
+
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
|
299
|
+
|
267
300
|
// Returns 0 on success
|
268
301
|
LLAMA_API int llama_model_quantize(
|
269
302
|
const char * fname_inp,
|
@@ -279,21 +312,73 @@ extern "C" {
|
|
279
312
|
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
|
280
313
|
struct llama_context * ctx,
|
281
314
|
const char * path_lora,
|
315
|
+
float scale,
|
282
316
|
const char * path_base_model,
|
283
317
|
int n_threads),
|
284
|
-
"
|
318
|
+
"use llama_model_apply_lora_from_file instead");
|
285
319
|
|
286
320
|
LLAMA_API int llama_model_apply_lora_from_file(
|
287
321
|
const struct llama_model * model,
|
288
|
-
|
289
|
-
|
290
|
-
|
322
|
+
const char * path_lora,
|
323
|
+
float scale,
|
324
|
+
const char * path_base_model,
|
325
|
+
int n_threads);
|
326
|
+
|
327
|
+
//
|
328
|
+
// KV cache
|
329
|
+
//
|
291
330
|
|
292
331
|
// Returns the number of tokens in the KV cache
|
293
|
-
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx)
|
332
|
+
LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
|
333
|
+
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
|
294
334
|
|
295
|
-
//
|
296
|
-
|
335
|
+
// Remove all tokens data of cells in [c0, c1)
|
336
|
+
// c0 < 0 : [0, c1)
|
337
|
+
// c1 < 0 : [c0, inf)
|
338
|
+
LLAMA_API void llama_kv_cache_tokens_rm(
|
339
|
+
struct llama_context * ctx,
|
340
|
+
int32_t c0,
|
341
|
+
int32_t c1);
|
342
|
+
|
343
|
+
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
344
|
+
// p0 < 0 : [0, p1)
|
345
|
+
// p1 < 0 : [p0, inf)
|
346
|
+
LLAMA_API void llama_kv_cache_seq_rm(
|
347
|
+
struct llama_context * ctx,
|
348
|
+
llama_seq_id seq_id,
|
349
|
+
llama_pos p0,
|
350
|
+
llama_pos p1);
|
351
|
+
|
352
|
+
// Copy all tokens that belong to the specified sequence to another sequence
|
353
|
+
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
354
|
+
// p0 < 0 : [0, p1)
|
355
|
+
// p1 < 0 : [p0, inf)
|
356
|
+
LLAMA_API void llama_kv_cache_seq_cp(
|
357
|
+
struct llama_context * ctx,
|
358
|
+
llama_seq_id seq_id_src,
|
359
|
+
llama_seq_id seq_id_dst,
|
360
|
+
llama_pos p0,
|
361
|
+
llama_pos p1);
|
362
|
+
|
363
|
+
// Removes all tokens that do not belong to the specified sequence
|
364
|
+
LLAMA_API void llama_kv_cache_seq_keep(
|
365
|
+
struct llama_context * ctx,
|
366
|
+
llama_seq_id seq_id);
|
367
|
+
|
368
|
+
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
369
|
+
// If the KV cache is RoPEd, the KV data is updated accordingly
|
370
|
+
// p0 < 0 : [0, p1)
|
371
|
+
// p1 < 0 : [p0, inf)
|
372
|
+
LLAMA_API void llama_kv_cache_seq_shift(
|
373
|
+
struct llama_context * ctx,
|
374
|
+
llama_seq_id seq_id,
|
375
|
+
llama_pos p0,
|
376
|
+
llama_pos p1,
|
377
|
+
llama_pos delta);
|
378
|
+
|
379
|
+
//
|
380
|
+
// State / sessions
|
381
|
+
//
|
297
382
|
|
298
383
|
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
299
384
|
// and kv_cache) - will often be smaller after compacting tokens
|
@@ -302,48 +387,102 @@ extern "C" {
|
|
302
387
|
// Copies the state to the specified destination address.
|
303
388
|
// Destination needs to have allocated enough memory.
|
304
389
|
// Returns the number of bytes copied
|
305
|
-
LLAMA_API size_t llama_copy_state_data(
|
390
|
+
LLAMA_API size_t llama_copy_state_data(
|
391
|
+
struct llama_context * ctx,
|
392
|
+
uint8_t * dst);
|
306
393
|
|
307
394
|
// Set the state reading from the specified address
|
308
395
|
// Returns the number of bytes read
|
309
|
-
LLAMA_API size_t llama_set_state_data(
|
396
|
+
LLAMA_API size_t llama_set_state_data(
|
397
|
+
struct llama_context * ctx,
|
398
|
+
uint8_t * src);
|
310
399
|
|
311
400
|
// Save/load session file
|
312
|
-
LLAMA_API bool llama_load_session_file(
|
313
|
-
|
401
|
+
LLAMA_API bool llama_load_session_file(
|
402
|
+
struct llama_context * ctx,
|
403
|
+
const char * path_session,
|
404
|
+
llama_token * tokens_out,
|
405
|
+
size_t n_token_capacity,
|
406
|
+
size_t * n_token_count_out);
|
314
407
|
|
315
|
-
|
408
|
+
LLAMA_API bool llama_save_session_file(
|
409
|
+
struct llama_context * ctx,
|
410
|
+
const char * path_session,
|
411
|
+
const llama_token * tokens,
|
412
|
+
size_t n_token_count);
|
413
|
+
|
414
|
+
//
|
415
|
+
// Decoding
|
416
|
+
//
|
417
|
+
|
418
|
+
// Run the llama inference to obtain the logits and probabilities for the next token(s).
|
316
419
|
// tokens + n_tokens is the provided batch of new tokens to process
|
317
420
|
// n_past is the number of tokens to use from previous eval calls
|
318
421
|
// Returns 0 on success
|
319
|
-
|
422
|
+
// DEPRECATED: use llama_decode() instead
|
423
|
+
LLAMA_API DEPRECATED(int llama_eval(
|
320
424
|
struct llama_context * ctx,
|
321
|
-
|
322
|
-
|
323
|
-
int n_past,
|
324
|
-
|
425
|
+
llama_token * tokens,
|
426
|
+
int32_t n_tokens,
|
427
|
+
int n_past),
|
428
|
+
"use llama_decode() instead");
|
325
429
|
|
326
430
|
// Same as llama_eval, but use float matrix input directly.
|
327
|
-
|
431
|
+
// DEPRECATED: use llama_decode() instead
|
432
|
+
LLAMA_API DEPRECATED(int llama_eval_embd(
|
328
433
|
struct llama_context * ctx,
|
329
|
-
|
330
|
-
|
331
|
-
int n_past,
|
332
|
-
|
434
|
+
float * embd,
|
435
|
+
int32_t n_tokens,
|
436
|
+
int n_past),
|
437
|
+
"use llama_decode() instead");
|
438
|
+
|
439
|
+
// Return batch for single sequence of tokens starting at pos_0
|
440
|
+
//
|
441
|
+
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
|
442
|
+
//
|
443
|
+
LLAMA_API struct llama_batch llama_batch_get_one(
|
444
|
+
llama_token * tokens,
|
445
|
+
int32_t n_tokens,
|
446
|
+
llama_pos pos_0,
|
447
|
+
llama_seq_id seq_id);
|
448
|
+
|
449
|
+
// Allocates a batch of tokens on the heap
|
450
|
+
// The batch has to be freed with llama_batch_free()
|
451
|
+
// If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
|
452
|
+
// Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
|
453
|
+
// The rest of the llama_batch members are allocated with size n_tokens
|
454
|
+
// All members are left uninitialized
|
455
|
+
LLAMA_API struct llama_batch llama_batch_init(
|
456
|
+
int32_t n_tokens,
|
457
|
+
int32_t embd);
|
458
|
+
|
459
|
+
// Frees a batch of tokens allocated with llama_batch_init()
|
460
|
+
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
461
|
+
|
462
|
+
// Positive return values do not mean a fatal error, but rather a warning.
|
463
|
+
// 0 - success
|
464
|
+
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
465
|
+
// < 0 - error
|
466
|
+
LLAMA_API int llama_decode(
|
467
|
+
struct llama_context * ctx,
|
468
|
+
struct llama_batch batch);
|
333
469
|
|
334
|
-
//
|
335
|
-
//
|
336
|
-
//
|
337
|
-
|
338
|
-
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
|
470
|
+
// Set the number of threads used for decoding
|
471
|
+
// n_threads is the number of threads used for generation (single token)
|
472
|
+
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
473
|
+
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
339
474
|
|
340
475
|
// Token logits obtained from the last call to llama_eval()
|
341
476
|
// The logits for the last token are stored in the last row
|
342
|
-
//
|
343
|
-
// Rows: n_tokens
|
477
|
+
// Logits for which llama_batch.logits[i] == 0 are undefined
|
478
|
+
// Rows: n_tokens provided with llama_batch
|
344
479
|
// Cols: n_vocab
|
345
480
|
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
346
481
|
|
482
|
+
// Logits for the ith token. Equivalent to:
|
483
|
+
// llama_get_logits(ctx) + i*n_vocab
|
484
|
+
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
485
|
+
|
347
486
|
// Get the embeddings for the input
|
348
487
|
// shape: [n_embd] (1-dimensional)
|
349
488
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
@@ -362,6 +501,11 @@ extern "C" {
|
|
362
501
|
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
|
363
502
|
LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
|
364
503
|
LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
|
504
|
+
// codellama infill tokens
|
505
|
+
LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
|
506
|
+
LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
|
507
|
+
LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
|
508
|
+
LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle
|
365
509
|
|
366
510
|
//
|
367
511
|
// Tokenization
|
@@ -372,14 +516,6 @@ extern "C" {
|
|
372
516
|
// Returns the number of tokens on success, no more than n_max_tokens
|
373
517
|
// Returns a negative number on failure - the number of tokens that would have been returned
|
374
518
|
LLAMA_API int llama_tokenize(
|
375
|
-
struct llama_context * ctx,
|
376
|
-
const char * text,
|
377
|
-
int text_len,
|
378
|
-
llama_token * tokens,
|
379
|
-
int n_max_tokens,
|
380
|
-
bool add_bos);
|
381
|
-
|
382
|
-
LLAMA_API int llama_tokenize_with_model(
|
383
519
|
const struct llama_model * model,
|
384
520
|
const char * text,
|
385
521
|
int text_len,
|
@@ -392,12 +528,6 @@ extern "C" {
|
|
392
528
|
// Does not write null terminator to the buffer.
|
393
529
|
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
|
394
530
|
LLAMA_API int llama_token_to_piece(
|
395
|
-
const struct llama_context * ctx,
|
396
|
-
llama_token token,
|
397
|
-
char * buf,
|
398
|
-
int length);
|
399
|
-
|
400
|
-
LLAMA_API int llama_token_to_piece_with_model(
|
401
531
|
const struct llama_model * model,
|
402
532
|
llama_token token,
|
403
533
|
char * buf,
|
@@ -420,11 +550,25 @@ extern "C" {
|
|
420
550
|
// Sampling functions
|
421
551
|
//
|
422
552
|
|
553
|
+
// Sets the current rng seed.
|
554
|
+
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
|
555
|
+
|
423
556
|
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
424
|
-
LLAMA_API void llama_sample_repetition_penalty(
|
557
|
+
LLAMA_API void llama_sample_repetition_penalty(
|
558
|
+
struct llama_context * ctx,
|
559
|
+
llama_token_data_array * candidates,
|
560
|
+
const llama_token * last_tokens,
|
561
|
+
size_t last_tokens_size,
|
562
|
+
float penalty);
|
425
563
|
|
426
564
|
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
427
|
-
LLAMA_API void llama_sample_frequency_and_presence_penalties(
|
565
|
+
LLAMA_API void llama_sample_frequency_and_presence_penalties(
|
566
|
+
struct llama_context * ctx,
|
567
|
+
llama_token_data_array * candidates,
|
568
|
+
const llama_token * last_tokens,
|
569
|
+
size_t last_tokens_size,
|
570
|
+
float alpha_frequency,
|
571
|
+
float alpha_presence);
|
428
572
|
|
429
573
|
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
430
574
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
@@ -437,23 +581,54 @@ extern "C" {
|
|
437
581
|
float scale);
|
438
582
|
|
439
583
|
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
440
|
-
LLAMA_API void llama_sample_softmax(
|
584
|
+
LLAMA_API void llama_sample_softmax(
|
585
|
+
struct llama_context * ctx,
|
586
|
+
llama_token_data_array * candidates);
|
441
587
|
|
442
588
|
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
443
|
-
LLAMA_API void llama_sample_top_k(
|
589
|
+
LLAMA_API void llama_sample_top_k(
|
590
|
+
struct llama_context * ctx,
|
591
|
+
llama_token_data_array * candidates,
|
592
|
+
int k,
|
593
|
+
size_t min_keep);
|
444
594
|
|
445
595
|
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
446
|
-
LLAMA_API void llama_sample_top_p(
|
596
|
+
LLAMA_API void llama_sample_top_p(
|
597
|
+
struct llama_context * ctx,
|
598
|
+
llama_token_data_array * candidates,
|
599
|
+
float p,
|
600
|
+
size_t min_keep);
|
447
601
|
|
448
602
|
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
449
|
-
LLAMA_API void llama_sample_tail_free(
|
603
|
+
LLAMA_API void llama_sample_tail_free(
|
604
|
+
struct llama_context * ctx,
|
605
|
+
llama_token_data_array * candidates,
|
606
|
+
float z,
|
607
|
+
size_t min_keep);
|
450
608
|
|
451
609
|
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
452
|
-
LLAMA_API void llama_sample_typical(
|
453
|
-
|
610
|
+
LLAMA_API void llama_sample_typical(
|
611
|
+
struct llama_context * ctx,
|
612
|
+
llama_token_data_array * candidates,
|
613
|
+
float p,
|
614
|
+
size_t min_keep);
|
615
|
+
|
616
|
+
LLAMA_API void llama_sample_temp(
|
617
|
+
struct llama_context * ctx,
|
618
|
+
llama_token_data_array * candidates,
|
619
|
+
float temp);
|
620
|
+
|
621
|
+
LLAMA_API DEPRECATED(void llama_sample_temperature(
|
622
|
+
struct llama_context * ctx,
|
623
|
+
llama_token_data_array * candidates,
|
624
|
+
float temp),
|
625
|
+
"use llama_sample_temp instead");
|
454
626
|
|
455
627
|
/// @details Apply constraints from grammar
|
456
|
-
LLAMA_API void llama_sample_grammar(
|
628
|
+
LLAMA_API void llama_sample_grammar(
|
629
|
+
struct llama_context * ctx,
|
630
|
+
llama_token_data_array * candidates,
|
631
|
+
const struct llama_grammar * grammar);
|
457
632
|
|
458
633
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
459
634
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
@@ -461,23 +636,41 @@ extern "C" {
|
|
461
636
|
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
462
637
|
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
463
638
|
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
464
|
-
LLAMA_API llama_token llama_sample_token_mirostat(
|
639
|
+
LLAMA_API llama_token llama_sample_token_mirostat(
|
640
|
+
struct llama_context * ctx,
|
641
|
+
llama_token_data_array * candidates,
|
642
|
+
float tau,
|
643
|
+
float eta,
|
644
|
+
int m,
|
645
|
+
float * mu);
|
465
646
|
|
466
647
|
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
467
648
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
468
649
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
469
650
|
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
470
651
|
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
471
|
-
LLAMA_API llama_token llama_sample_token_mirostat_v2(
|
652
|
+
LLAMA_API llama_token llama_sample_token_mirostat_v2(
|
653
|
+
struct llama_context * ctx,
|
654
|
+
llama_token_data_array * candidates,
|
655
|
+
float tau,
|
656
|
+
float eta,
|
657
|
+
float * mu);
|
472
658
|
|
473
659
|
/// @details Selects the token with the highest probability.
|
474
|
-
LLAMA_API llama_token llama_sample_token_greedy(
|
660
|
+
LLAMA_API llama_token llama_sample_token_greedy(
|
661
|
+
struct llama_context * ctx,
|
662
|
+
llama_token_data_array * candidates);
|
475
663
|
|
476
664
|
/// @details Randomly selects a token from the candidates based on their probabilities.
|
477
|
-
LLAMA_API llama_token llama_sample_token(
|
665
|
+
LLAMA_API llama_token llama_sample_token(
|
666
|
+
struct llama_context * ctx,
|
667
|
+
llama_token_data_array * candidates);
|
478
668
|
|
479
669
|
/// @details Accepts the sampled token into the grammar
|
480
|
-
LLAMA_API void llama_grammar_accept_token(
|
670
|
+
LLAMA_API void llama_grammar_accept_token(
|
671
|
+
struct llama_context * ctx,
|
672
|
+
struct llama_grammar * grammar,
|
673
|
+
llama_token token);
|
481
674
|
|
482
675
|
//
|
483
676
|
// Beam search
|
@@ -485,9 +678,10 @@ extern "C" {
|
|
485
678
|
|
486
679
|
struct llama_beam_view {
|
487
680
|
const llama_token * tokens;
|
681
|
+
|
488
682
|
size_t n_tokens;
|
489
|
-
float
|
490
|
-
bool
|
683
|
+
float p; // Cumulative beam probability (renormalized relative to all beams)
|
684
|
+
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
491
685
|
};
|
492
686
|
|
493
687
|
// Passed to beam_search_callback function.
|
@@ -496,9 +690,10 @@ extern "C" {
|
|
496
690
|
// These pointers are valid only during the synchronous callback, so should not be saved.
|
497
691
|
struct llama_beams_state {
|
498
692
|
struct llama_beam_view * beam_views;
|
693
|
+
|
499
694
|
size_t n_beams; // Number of elements in beam_views[].
|
500
695
|
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
|
501
|
-
bool
|
696
|
+
bool last_call; // True iff this is the last callback invocation.
|
502
697
|
};
|
503
698
|
|
504
699
|
// Type of pointer to the beam_search_callback function.
|
@@ -513,11 +708,17 @@ extern "C" {
|
|
513
708
|
/// @param n_beams Number of beams to use.
|
514
709
|
/// @param n_past Number of tokens already evaluated.
|
515
710
|
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
|
516
|
-
|
517
|
-
|
711
|
+
LLAMA_API void llama_beam_search(
|
712
|
+
struct llama_context * ctx,
|
713
|
+
llama_beam_search_callback_fn_t callback,
|
714
|
+
void * callback_data,
|
715
|
+
size_t n_beams,
|
716
|
+
int n_past,
|
717
|
+
int n_predict);
|
518
718
|
|
519
719
|
// Performance information
|
520
720
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
721
|
+
|
521
722
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
522
723
|
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
523
724
|
|
@@ -526,7 +727,7 @@ extern "C" {
|
|
526
727
|
|
527
728
|
// Set callback for all future logging events.
|
528
729
|
// If this is not called, or NULL is supplied, everything is output on stderr.
|
529
|
-
LLAMA_API void llama_log_set(
|
730
|
+
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
530
731
|
|
531
732
|
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
532
733
|
|