llama_cpp 0.5.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -37,10 +37,12 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
+#define LLAMA_MAX_RNG_STATE (64*1024)
+
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 1
+#define LLAMA_SESSION_VERSION 2
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -60,13 +62,9 @@ extern "C" {
     struct llama_model;
     struct llama_context;
 
-    typedef int llama_token;
-
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
+    typedef int32_t llama_pos;
+    typedef int32_t llama_token;
+    typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@@ -86,24 +84,24 @@ extern "C" {
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
-        LLAMA_FTYPE_MOSTLY_F16 = 1,
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,
-        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,
-        LLAMA_FTYPE_MOSTLY_Q2_K = 10,
-        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,
-        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,
-        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,
-        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,
-        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,
-        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,
-        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,
-        LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+        LLAMA_FTYPE_MOSTLY_F16 = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -122,41 +120,68 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-
-
-
-
-
-
-
+    // Input data for llama_decode
+    // A llama_batch object can contain input about one or many sequences
+    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+    //
+    // - token  : the token ids of the input (used when embd is NULL)
+    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+    // - pos    : the positions of the respective token in the sequence
+    // - seq_id : the sequence to which the respective token belongs
+    // - logits : if zero, the logits for the respective token will not be output
+    //
+    typedef struct llama_batch {
+        int32_t n_tokens;
+
+        llama_token  * token;
+        float        * embd;
+        llama_pos    * pos;
+        llama_seq_id * seq_id;
+        int8_t       * logits;
+
+        // NOTE: helpers for smooth API transition - can be deprecated in the future
+        // for future-proof code, use the above fields instead and ignore everything below
+        //
+        // pos[i] = all_pos_0 + i*all_pos_1
+        //
+        llama_pos    all_pos_0;  // used if pos == NULL
+        llama_pos    all_pos_1;  // used if pos == NULL
+        llama_seq_id all_seq_id; // used if seq_id == NULL
+    } llama_batch;
+
+    struct llama_model_params {
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
-
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
        void * progress_callback_user_data;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
     };
 
-
-
-
-
-
-
+    struct llama_context_params {
+        uint32_t seed;            // RNG seed, -1 for random
+        uint32_t n_ctx;           // text context, 0 = from model
+        uint32_t n_batch;         // prompt processing maximum batch size
+        uint32_t n_threads;       // number of threads to use for generation
+        uint32_t n_threads_batch; // number of threads to use for batch processing
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency, 0 = from model
+        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool embedding;  // embedding mode only
+    };
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
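Note (not part of the diff): a minimal sketch of filling the `llama_batch` introduced above by hand, using only the fields it declares. Array sizes follow the header comment that every provided array must hold `n_tokens` entries; the helper name `prompt_batch` is hypothetical.

```c
#include <stddef.h>
#include "llama.h"

// Hypothetical helper: build a 3-token batch for sequence 0 using caller-owned arrays.
static struct llama_batch prompt_batch(llama_token t0, llama_token t1, llama_token t2) {
    static llama_token  token[3];
    static llama_pos    pos[3];
    static llama_seq_id seq_id[3];
    static int8_t       logits[3];

    token[0] = t0; token[1] = t1; token[2] = t2;
    for (int i = 0; i < 3; ++i) {
        pos[i]    = i; // position of each token within its sequence
        seq_id[i] = 0; // every token belongs to sequence 0
        logits[i] = 0; // no logits for prompt tokens ...
    }
    logits[2] = 1;     // ... except the last one

    struct llama_batch batch = {
        /*n_tokens  =*/ 3,
        /*token     =*/ token,
        /*embd      =*/ NULL,
        /*pos       =*/ pos,
        /*seq_id    =*/ seq_id,
        /*logits    =*/ logits,
        /*all_pos_0 =*/ 0,
        /*all_pos_1 =*/ 0,
        /*all_seq_id=*/ 0,
    };
    return batch;
}
```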
@@ -215,6 +240,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // Helpers for getting default parameters
+    LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
@@ -228,7 +255,7 @@ extern "C" {
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            struct llama_model_params params);
 
     LLAMA_API void llama_free_model(struct llama_model * model);
 
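Note (not part of the diff): a sketch of the two-step setup implied by the model/context parameter split, assuming `llama_new_context_with_model` and `llama_free`, which are declared elsewhere in this header and unchanged by this diff.

```c
#include <stdio.h>
#include "llama.h"

// Hypothetical setup: model params control loading, context params control per-context settings.
int setup(const char * path_model) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 32; // offload layers if built with GPU support

    struct llama_model * model = llama_load_model_from_file(path_model, mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load: %s\n", path_model);
        return 1;
    }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 2048; // 0 would mean "take it from the model"
    cparams.n_threads = 4;

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... use ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```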
@@ -245,25 +272,31 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported (void);
     LLAMA_API bool llama_mlock_supported(void);
 
-    LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
     LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
+
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);
 
-
-    LLAMA_API
-    LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
+    // Get the model's RoPE frequency scaling factor
+    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Get a llama model tensor
+    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
@@ -279,21 +312,73 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads),
-            "use llama_model_apply_lora_from_file instead");
+            "use llama_model_apply_lora_from_file instead");
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-            const char * path_lora,
-            const char * path_base_model,
-            int n_threads);
+            const char * path_lora,
+            float scale,
+            const char * path_base_model,
+            int n_threads);
+
+    //
+    // KV cache
+    //
 
     // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+    // Remove all tokens data of cells in [c0, c1)
+    // c0 < 0 : [0, c1]
+    // c1 < 0 : [c0, inf)
+    LLAMA_API void llama_kv_cache_tokens_rm(
+            struct llama_context * ctx,
+            int32_t c0,
+            int32_t c1);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_shift(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
+
+    //
+    // State / sessions
+    //
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
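Note (not part of the diff): a sketch of how the per-sequence KV cache calls above compose, e.g. to reuse a prompt decoded as sequence 0 for a second sequence and discard sequence 0's continuation. The function name `fork_prompt` is hypothetical.

```c
#include "llama.h"

// Hypothetical: after decoding a prompt of n_prompt tokens as sequence 0 and
// generating up to position n_past, share the prompt with sequence 1 and drop
// the generated tail from sequence 0.
static void fork_prompt(struct llama_context * ctx, llama_pos n_prompt, llama_pos n_past) {
    // assign the prompt cells to sequence 1 as well; no extra KV memory is allocated
    llama_kv_cache_seq_cp(ctx, /*seq_id_src=*/ 0, /*seq_id_dst=*/ 1, 0, n_prompt);

    // remove sequence 0's tokens in positions [n_prompt, n_past)
    llama_kv_cache_seq_rm(ctx, /*seq_id=*/ 0, n_prompt, n_past);
}
```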
@@ -302,48 +387,102 @@ extern "C" {
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+    LLAMA_API size_t llama_copy_state_data(
+            struct llama_context * ctx,
+            uint8_t * dst);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(
+            struct llama_context * ctx,
+            uint8_t * src);
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
 
-    // Run the llama inference to obtain the logits and probabilities for the next token.
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+
+    //
+    // Decoding
+    //
+
+    // Run the llama inference to obtain the logits and probabilities for the next token(s).
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
     // Returns 0 on success
-    LLAMA_API int llama_eval(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval(
             struct llama_context * ctx,
-            const llama_token * tokens,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            llama_token * tokens,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
 
     // Same as llama_eval, but use float matrix input directly.
-    LLAMA_API int llama_eval_embd(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval_embd(
             struct llama_context * ctx,
-            const float * embd,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            float * embd,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
+
+    // Return batch for single sequence of tokens starting at pos_0
+    //
+    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    //
+    LLAMA_API struct llama_batch llama_batch_get_one(
+            llama_token * tokens,
+            int32_t n_tokens,
+            llama_pos pos_0,
+            llama_seq_id seq_id);
+
+    // Allocates a batch of tokens on the heap
+    // The batch has to be freed with llama_batch_free()
+    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+    // The rest of the llama_batch members are allocated with size n_tokens
+    // All members are left uninitialized
+    LLAMA_API struct llama_batch llama_batch_init(
+            int32_t n_tokens,
+            int32_t embd);
+
+    // Frees a batch of tokens allocated with llama_batch_init()
+    LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+    // Positive return values does not mean a fatal error, but rather a warning.
+    // 0 - success
+    // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // < 0 - error
+    LLAMA_API int llama_decode(
+            struct llama_context * ctx,
+            struct llama_batch batch);
 
-    //
-    //
-    //
-
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    // Set the number of threads used for decoding
+    // n_threads is the number of threads used for generation (single token)
+    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
-    //
-    // Rows: n_tokens
+    // Logits for which llama_batch.logits[i] == 0 are undefined
+    // Rows: n_tokens provided with llama_batch
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
+    // Logits for the ith token. Equivalent to:
+    // llama_get_logits(ctx) + i*n_vocab
+    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
     // Get the embeddings for the input
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
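Note (not part of the diff): a sketch of the eval-to-decode migration using only the declarations above; `llama_batch_get_one` builds a single-sequence batch, and `llama_get_logits_ith` reads the logits of the last prompt token. The function name `decode_prompt` is hypothetical.

```c
#include "llama.h"

// Hypothetical replacement for an old llama_eval(ctx, tokens, n_tokens, n_past, n_threads) call.
int decode_prompt(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens) {
    // single sequence (id 0) starting at position 0
    struct llama_batch batch = llama_batch_get_one(tokens, n_tokens, 0, 0);

    const int ret = llama_decode(ctx, batch);
    if (ret != 0) {
        return ret; // 1 = no KV slot found, < 0 = error (see llama_decode above)
    }

    // logits of the last prompt token (n_vocab floats)
    float * logits = llama_get_logits_ith(ctx, n_tokens - 1);
    (void) logits;
    return 0;
}
```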
@@ -362,6 +501,11 @@ extern "C" {
     LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
     LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+    // codellama infill tokens
+    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
 
     //
     // Tokenization
@@ -372,14 +516,6 @@ extern "C" {
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
     LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-            const char * text,
-            int text_len,
-            llama_token * tokens,
-            int n_max_tokens,
-            bool add_bos);
-
-    LLAMA_API int llama_tokenize_with_model(
             const struct llama_model * model,
             const char * text,
             int text_len,
@@ -392,12 +528,6 @@ extern "C" {
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
     LLAMA_API int llama_token_to_piece(
-            const struct llama_context * ctx,
-            llama_token token,
-            char * buf,
-            int length);
-
-    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
             llama_token token,
             char * buf,
@@ -420,11 +550,25 @@ extern "C" {
     // Sampling functions
     //
 
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+    LLAMA_API void llama_sample_repetition_penalty(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float penalty);
 
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float alpha_frequency,
+            float alpha_presence);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -437,23 +581,54 @@ extern "C" {
             float scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API void llama_sample_softmax(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+    LLAMA_API void llama_sample_top_k(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            int k,
+            size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+    LLAMA_API void llama_sample_top_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+    LLAMA_API void llama_sample_tail_free(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float z,
+            size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+    LLAMA_API void llama_sample_typical(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
+    LLAMA_API void llama_sample_temp(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp);
+
+    LLAMA_API DEPRECATED(void llama_sample_temperature(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp),
+            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+    LLAMA_API void llama_sample_grammar(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const struct llama_grammar * grammar);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -461,23 +636,41 @@ extern "C" {
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            int m,
+            float * mu);
 
     /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            float * mu);
 
     /// @details Selects the token with the highest probability.
-    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token_greedy(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+    LLAMA_API void llama_grammar_accept_token(
+            struct llama_context * ctx,
+            struct llama_grammar * grammar,
+            llama_token token);
 
     //
     // Beam search
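Note (not part of the diff): a sketch of a conventional sampling chain built from the functions above. `llama_token_data` and `llama_token_data_array` are assumed to be the candidate-list types declared earlier in this header (unchanged by this diff), with fields `id`/`logit`/`p` and `data`/`size`/`sorted`; the function name `sample_next` is hypothetical.

```c
#include <stdbool.h>
#include <stdlib.h>
#include "llama.h"

// Hypothetical: pick the next token from raw logits with top-k / top-p / temperature.
llama_token sample_next(struct llama_context * ctx, const float * logits, int n_vocab) {
    llama_token_data * data = malloc((size_t) n_vocab * sizeof(llama_token_data));
    for (int i = 0; i < n_vocab; ++i) {
        data[i].id    = i;
        data[i].logit = logits[i];
        data[i].p     = 0.0f;
    }
    llama_token_data_array candidates = { data, (size_t) n_vocab, false };

    llama_sample_top_k(ctx, &candidates, 40, 1);
    llama_sample_top_p(ctx, &candidates, 0.95f, 1);
    llama_sample_temp (ctx, &candidates, 0.80f); // replaces the deprecated llama_sample_temperature

    const llama_token id = llama_sample_token(ctx, &candidates);
    free(data);
    return id;
}
```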
@@ -485,9 +678,10 @@ extern "C" {
 
     struct llama_beam_view {
         const llama_token * tokens;
+
         size_t n_tokens;
-        float
-        bool
+        float p;  // Cumulative beam probability (renormalized relative to all beams)
+        bool eob; // Callback should set this to true when a beam is at end-of-beam.
     };
 
     // Passed to beam_search_callback function.
@@ -496,9 +690,10 @@ extern "C" {
     // These pointers are valid only during the synchronous callback, so should not be saved.
     struct llama_beams_state {
         struct llama_beam_view * beam_views;
+
         size_t n_beams;               // Number of elements in beam_views[].
         size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-        bool
+        bool last_call;               // True iff this is the last callback invocation.
     };
 
     // Type of pointer to the beam_search_callback function.
@@ -513,11 +708,17 @@ extern "C" {
     /// @param n_beams Number of beams to use.
     /// @param n_past Number of tokens already evaluated.
     /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-
-
+    LLAMA_API void llama_beam_search(
+            struct llama_context * ctx,
+            llama_beam_search_callback_fn_t callback,
+            void * callback_data,
+            size_t n_beams,
+            int n_past,
+            int n_predict);
 
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
@@ -526,7 +727,7 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 