llama_cpp 0.5.3 → 0.7.0

This diff compares the content of publicly released package versions as published to their public registry and is provided for informational purposes only. The hunks below are from the llama.cpp C API header (llama.h) shipped with the package, whose API changed substantially between these releases.
@@ -37,10 +37,12 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
+#define LLAMA_MAX_RNG_STATE (64*1024)
+
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 1
+#define LLAMA_SESSION_VERSION 2
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
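The RNG-state bound and the session-version bump mean that session files written by 0.5.x are rejected by 0.7.0 and have to be regenerated. A minimal sketch of saving and restoring a prompt via the llama_save_session_file/llama_load_session_file declarations further down in this diff; the file name and capacity are placeholders:

```c
#include "llama.h"

// Sketch only: persist the evaluated prompt tokens of a context and restore
// them later. Loading returns false on failure, e.g. on a
// LLAMA_SESSION_VERSION mismatch for files written by llama_cpp 0.5.x.
static bool example_save_session(struct llama_context * ctx,
                                 const llama_token * tokens, size_t n_tokens) {
    return llama_save_session_file(ctx, "prompt.session", tokens, n_tokens);
}

static size_t example_load_session(struct llama_context * ctx,
                                   llama_token * tokens_out, size_t capacity) {
    size_t n_loaded = 0;
    if (!llama_load_session_file(ctx, "prompt.session", tokens_out, capacity, &n_loaded)) {
        return 0; // load failed or version mismatch
    }
    return n_loaded;
}
```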
@@ -60,13 +62,9 @@ extern "C" {
     struct llama_model;
     struct llama_context;
 
-    typedef int llama_token;
-
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
+    typedef int32_t llama_pos;
+    typedef int32_t llama_token;
+    typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@@ -86,24 +84,24 @@ extern "C" {
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
-        LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_F16 = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -122,41 +120,68 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
-
+    // Input data for llama_decode
+    // A llama_batch object can contain input about one or many sequences
+    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+    //
+    // - token  : the token ids of the input (used when embd is NULL)
+    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+    // - pos    : the positions of the respective token in the sequence
+    // - seq_id : the sequence to which the respective token belongs
+    // - logits : if zero, the logits for the respective token will not be output
+    //
+    typedef struct llama_batch {
+        int32_t n_tokens;
+
+        llama_token  * token;
+        float        * embd;
+        llama_pos    * pos;
+        llama_seq_id * seq_id;
+        int8_t       * logits;
+
+        // NOTE: helpers for smooth API transition - can be deprecated in the future
+        //       for future-proof code, use the above fields instead and ignore everything below
+        //
+        // pos[i] = all_pos_0 + i*all_pos_1
+        //
+        llama_pos    all_pos_0;  // used if pos == NULL
+        llama_pos    all_pos_1;  // used if pos == NULL
+        llama_seq_id all_seq_id; // used if seq_id == NULL
+    } llama_batch;
+
+    struct llama_model_params {
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
-
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
        void * progress_callback_user_data;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
     };
 
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+    struct llama_context_params {
+        uint32_t seed;            // RNG seed, -1 for random
+        uint32_t n_ctx;           // text context, 0 = from model
+        uint32_t n_batch;         // prompt processing maximum batch size
+        uint32_t n_threads;       // number of threads to use for generation
+        uint32_t n_threads_batch; // number of threads to use for batch processing
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency, 0 = from model
+        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool embedding;  // embedding mode only
+    };
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
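To make the new batch layout concrete, here is a hand-filled llama_batch for a short prompt. Only the llama_* types and field names come from the header above; the array contents and sizes are illustrative.

```c
#include "llama.h"

// Sketch: describe three prompt tokens of sequence 0 and request logits
// only for the last one. The token ids are made up.
static struct llama_batch example_prompt_batch(void) {
    static llama_token  tok[3] = { 1, 15043, 2787 };
    static llama_pos    pos[3] = { 0, 1, 2 };
    static llama_seq_id seq[3] = { 0, 0, 0 };
    static int8_t       out[3] = { 0, 0, 1 };

    struct llama_batch batch = {
        /* .n_tokens   = */ 3,
        /* .token      = */ tok,
        /* .embd       = */ NULL, // token ids are used, so embd stays NULL
        /* .pos        = */ pos,
        /* .seq_id     = */ seq,
        /* .logits     = */ out,
        /* .all_pos_0  = */ 0,    // ignored because pos    != NULL
        /* .all_pos_1  = */ 0,
        /* .all_seq_id = */ 0,    // ignored because seq_id != NULL
    };
    return batch;
}
```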
@@ -215,6 +240,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // Helpers for getting default parameters
+    LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
@@ -228,7 +255,7 @@ extern "C" {
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            struct llama_model_params params);
 
     LLAMA_API void llama_free_model(struct llama_model * model);
 
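The net effect of the llama_context_params split is a two-step setup: model-level options travel in llama_model_params, per-context options in llama_context_params. A hedged sketch of the resulting flow; the file path and numeric values are placeholders, and llama_new_context_with_model is assumed unchanged since it does not appear in this diff.

```c
#include "llama.h"

// Sketch: load a model and create a context with the split parameter structs.
struct llama_context * example_setup(void) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 32;                 // model-level option

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return NULL;
    }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 4096;            // 0 would mean "take the model's value"
    cparams.n_threads       = 8;               // generation threads
    cparams.n_threads_batch = 8;               // prompt/batch threads

    // assumed unchanged by this diff
    return llama_new_context_with_model(model, cparams);
}
```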
@@ -245,25 +272,31 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported (void);
     LLAMA_API bool llama_mlock_supported(void);
 
-    LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
     LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
+
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);
 
-    LLAMA_API int llama_model_n_vocab    (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx      (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
+    // Get the model's RoPE frequency scaling factor
+    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Get a llama model tensor
+    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
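Because the metadata getters are now keyed on the model, code that only holds a context reaches them through llama_get_model. A small sketch; the printed labels are illustrative.

```c
#include <stdio.h>
#include "llama.h"

// Sketch: query model metadata from an existing context via llama_get_model.
static void example_print_model_info(struct llama_context * ctx) {
    const struct llama_model * model = llama_get_model(ctx);

    printf("vocab size      : %d\n", llama_n_vocab(model));
    printf("training context: %d\n", llama_n_ctx_train(model));
    printf("embedding size  : %d\n", llama_n_embd(model));
    printf("rope freq scale : %g\n", llama_rope_freq_scale_train(model));
    printf("context size    : %d\n", llama_n_ctx(ctx)); // still per-context
}
```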
@@ -279,21 +312,73 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads),
-            "please use llama_model_apply_lora_from_file instead");
+            "use llama_model_apply_lora_from_file instead");
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-            const char * path_lora,
-            const char * path_base_model,
-            int n_threads);
+            const char * path_lora,
+            float scale,
+            const char * path_base_model,
+            int n_threads);
+
+    //
+    // KV cache
+    //
 
     // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+    // Remove all tokens data of cells in [c0, c1)
+    // c0 < 0 : [0, c1]
+    // c1 < 0 : [c0, inf)
+    LLAMA_API void llama_kv_cache_tokens_rm(
+            struct llama_context * ctx,
+            int32_t c0,
+            int32_t c1);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_shift(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
+
+    //
+    // State / sessions
+    //
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
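These sequence operations compose into the usual cache-management patterns; for instance, a simple context shift drops the oldest positions of a sequence and slides the rest back so new tokens fit. A hedged sketch (which positions to discard is up to the caller):

```c
#include "llama.h"

// Sketch: make room in sequence 0 by discarding its oldest n_discard positions
// and shifting the remaining positions back by the same amount. For a RoPE'd
// KV cache, llama_kv_cache_seq_shift updates the cached data accordingly.
static void example_context_shift(struct llama_context * ctx,
                                  llama_pos n_past, llama_pos n_discard) {
    const llama_seq_id seq = 0;

    // remove positions [0, n_discard) of sequence 0
    llama_kv_cache_seq_rm(ctx, seq, 0, n_discard);

    // shift positions [n_discard, n_past) by -n_discard
    llama_kv_cache_seq_shift(ctx, seq, n_discard, n_past, -n_discard);
}
```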
@@ -302,48 +387,102 @@ extern "C" {
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+    LLAMA_API size_t llama_copy_state_data(
+            struct llama_context * ctx,
+            uint8_t * dst);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(
+            struct llama_context * ctx,
+            uint8_t * src);
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
 
-    // Run the llama inference to obtain the logits and probabilities for the next token.
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+
+    //
+    // Decoding
+    //
+
+    // Run the llama inference to obtain the logits and probabilities for the next token(s).
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
     // Returns 0 on success
-    LLAMA_API int llama_eval(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval(
             struct llama_context * ctx,
-            const llama_token * tokens,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            llama_token * tokens,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
 
     // Same as llama_eval, but use float matrix input directly.
-    LLAMA_API int llama_eval_embd(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval_embd(
             struct llama_context * ctx,
-            const float * embd,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+            float * embd,
+            int32_t n_tokens,
+            int n_past),
+            "use llama_decode() instead");
+
+    // Return batch for single sequence of tokens starting at pos_0
+    //
+    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    //
+    LLAMA_API struct llama_batch llama_batch_get_one(
+            llama_token * tokens,
+            int32_t n_tokens,
+            llama_pos pos_0,
+            llama_seq_id seq_id);
+
+    // Allocates a batch of tokens on the heap
+    // The batch has to be freed with llama_batch_free()
+    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+    // The rest of the llama_batch members are allocated with size n_tokens
+    // All members are left uninitialized
+    LLAMA_API struct llama_batch llama_batch_init(
+            int32_t n_tokens,
+            int32_t embd);
+
+    // Frees a batch of tokens allocated with llama_batch_init()
+    LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+    // Positive return values does not mean a fatal error, but rather a warning.
+    //   0 - success
+    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // < 0 - error
+    LLAMA_API int llama_decode(
+            struct llama_context * ctx,
+            struct llama_batch batch);
 
-    // Export a static computation graph for context of 511 and batch size of 1
-    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
-    //       parameters here to keep things simple
-    // IMPORTANT: do not use for anything else other than debugging and testing!
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    // Set the number of threads used for decoding
+    // n_threads is the number of threads used for generation (single token)
+    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
+    // Logits for which llama_batch.logits[i] == 0 are undefined
+    // Rows: n_tokens provided with llama_batch
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
+    // Logits for the ith token. Equivalent to:
+    // llama_get_logits(ctx) + i*n_vocab
+    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
     // Get the embeddings for the input
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
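Taken together, the decoding changes replace llama_eval(tokens, n_tokens, n_past, n_threads) with a llama_batch handed to llama_decode, with threads configured once per context via llama_set_n_threads. A migration sketch for feeding a single new token; error handling and n_past bookkeeping stay with the caller.

```c
#include "llama.h"

// Sketch: decode one new token via the transitional llama_batch_get_one helper
// and read its logits back with llama_get_logits_ith.
static const float * example_decode_one(struct llama_context * ctx,
                                         llama_token new_token,
                                         llama_pos n_past) {
    struct llama_batch batch = llama_batch_get_one(&new_token, 1, n_past, 0);

    const int ret = llama_decode(ctx, batch);
    if (ret != 0) {
        // 1 means no KV slot was found (shrink the batch or grow the context);
        // negative values are errors
        return NULL;
    }

    // the batch held a single token, so its logits are row 0
    return llama_get_logits_ith(ctx, 0);
}
```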
@@ -362,6 +501,11 @@ extern "C" {
     LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
     LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+    // codellama infill tokens
+    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
 
     //
     // Tokenization
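The four new special tokens support fill-in-the-middle prompting with Code Llama style models. A hedged sketch of the conventional prompt layout, prefix then suffix then middle; buffer sizing and the tokenization of the two text halves are left out.

```c
#include "llama.h"

// Sketch: lay out a fill-in-the-middle prompt as
//   <PRE> prefix-tokens <SUF> suffix-tokens <MID>
// and let the model generate the middle part; generation conventionally stops
// once the model emits llama_token_eot(ctx). buf must be large enough.
static int example_infill_prompt(struct llama_context * ctx,
                                 const llama_token * prefix, int n_prefix,
                                 const llama_token * suffix, int n_suffix,
                                 llama_token * buf) {
    int n = 0;
    buf[n++] = llama_token_prefix(ctx);
    for (int i = 0; i < n_prefix; i++) { buf[n++] = prefix[i]; }
    buf[n++] = llama_token_suffix(ctx);
    for (int i = 0; i < n_suffix; i++) { buf[n++] = suffix[i]; }
    buf[n++] = llama_token_middle(ctx);
    return n;
}
```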
@@ -372,14 +516,6 @@ extern "C" {
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
     LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-            const char * text,
-            int text_len,
-            llama_token * tokens,
-            int n_max_tokens,
-            bool add_bos);
-
-    LLAMA_API int llama_tokenize_with_model(
             const struct llama_model * model,
             const char * text,
             int text_len,
@@ -392,12 +528,6 @@ extern "C" {
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
     LLAMA_API int llama_token_to_piece(
-            const struct llama_context * ctx,
-            llama_token token,
-            char * buf,
-            int length);
-
-    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
             llama_token token,
             char * buf,
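With the _with_model variants folded into the main entry points, both tokenization and piece lookup are keyed on the model. A round-trip sketch; buffer sizes are illustrative.

```c
#include <stdio.h>
#include <string.h>
#include "llama.h"

// Sketch: tokenize a string with the model-based API and print each piece.
static void example_round_trip(const struct llama_model * model, const char * text) {
    llama_token tokens[256];                 // illustrative capacity
    const int n = llama_tokenize(model, text, (int) strlen(text),
                                 tokens, 256, /*add_bos=*/ true);
    if (n < 0) {
        return;                              // -n tokens would have been required
    }

    for (int i = 0; i < n; i++) {
        char piece[64];                      // note: no null terminator is written
        const int len = llama_token_to_piece(model, tokens[i], piece, (int) sizeof(piece));
        if (len >= 0) {
            printf("%d -> '%.*s'\n", tokens[i], len, piece);
        }
    }
}
```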
@@ -420,11 +550,25 @@ extern "C" {
     // Sampling functions
     //
 
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+    LLAMA_API void llama_sample_repetition_penalty(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float penalty);
 
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float alpha_frequency,
+            float alpha_presence);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -437,23 +581,54 @@ extern "C" {
             float scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API void llama_sample_softmax(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+    LLAMA_API void llama_sample_top_k(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            int k,
+            size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+    LLAMA_API void llama_sample_top_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+    LLAMA_API void llama_sample_tail_free(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float z,
+            size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+    LLAMA_API void llama_sample_typical(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
+    LLAMA_API void llama_sample_temp(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp);
+
+    LLAMA_API DEPRECATED(void llama_sample_temperature(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float temp),
+            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+    LLAMA_API void llama_sample_grammar(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const struct llama_grammar * grammar);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -461,23 +636,41 @@ extern "C" {
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            int m,
+            float * mu);
 
     /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float tau,
+            float eta,
+            float * mu);
 
     /// @details Selects the token with the highest probability.
-    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token_greedy(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates);
 
     /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+    LLAMA_API void llama_grammar_accept_token(
+            struct llama_context * ctx,
+            struct llama_grammar * grammar,
+            llama_token token);
 
     //
     // Beam search
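The samplers themselves are unchanged apart from the reflowed signatures and the llama_sample_temperature → llama_sample_temp rename; they still chain over a llama_token_data_array. A minimal top-k/top-p/temperature pipeline sketch, with illustrative parameter values and assuming llama_token_data and llama_token_data_array keep their existing layout (they are not touched by this diff):

```c
#include <stdlib.h>
#include "llama.h"

// Sketch: sample the next token from the logits of batch row i.
static llama_token example_sample(struct llama_context * ctx, int32_t i) {
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const float * logits = llama_get_logits_ith(ctx, i);

    // build the candidate array from the raw logits
    llama_token_data * cand = malloc(n_vocab * sizeof(llama_token_data));
    for (llama_token id = 0; id < n_vocab; id++) {
        cand[id] = (llama_token_data) { id, logits[id], 0.0f };
    }
    llama_token_data_array cand_p = { cand, (size_t) n_vocab, false };

    llama_sample_top_k(ctx, &cand_p, 40, 1);
    llama_sample_top_p(ctx, &cand_p, 0.95f, 1);
    llama_sample_temp (ctx, &cand_p, 0.80f);  // was llama_sample_temperature

    const llama_token token = llama_sample_token(ctx, &cand_p);
    free(cand);
    return token;
}
```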
@@ -485,9 +678,10 @@ extern "C" {
 
     struct llama_beam_view {
         const llama_token * tokens;
+
         size_t n_tokens;
-        float p; // Cumulative beam probability (renormalized relative to all beams)
-        bool eob; // Callback should set this to true when a beam is at end-of-beam.
+        float  p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob; // Callback should set this to true when a beam is at end-of-beam.
     };
 
     // Passed to beam_search_callback function.
@@ -496,9 +690,10 @@ extern "C" {
     // These pointers are valid only during the synchronous callback, so should not be saved.
     struct llama_beams_state {
         struct llama_beam_view * beam_views;
+
         size_t n_beams;               // Number of elements in beam_views[].
         size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-        bool last_call;               // True iff this is the last callback invocation.
+        bool   last_call;             // True iff this is the last callback invocation.
     };
 
     // Type of pointer to the beam_search_callback function.
@@ -513,11 +708,17 @@ extern "C" {
     /// @param n_beams Number of beams to use.
     /// @param n_past Number of tokens already evaluated.
     /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-    /// @param n_threads Number of threads as passed to llama_eval().
-    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+    LLAMA_API void llama_beam_search(
+            struct llama_context * ctx,
+            llama_beam_search_callback_fn_t callback,
+            void * callback_data,
+            size_t n_beams,
+            int n_past,
+            int n_predict);
 
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
@@ -526,7 +727,7 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 
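Finally, llama_log_set now takes ggml's callback type instead of the removed llama_log_callback. A sketch of a matching callback; the enum ggml_log_level names and the callback shape are taken from ggml.h of the same release and are an assumption here, since ggml.h is not part of this diff.

```c
#include <stdio.h>
#include "llama.h"   // pulls in ggml.h, which declares ggml_log_callback

// Sketch: route llama.cpp log output to stderr with a level prefix.
static void example_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    const char * tag = (level == GGML_LOG_LEVEL_ERROR) ? "E"
                     : (level == GGML_LOG_LEVEL_WARN)  ? "W"
                                                       : "I";
    fprintf(stderr, "[%s] %s", tag, text); // text usually already ends with '\n'
}

// usage: llama_log_set(example_log_callback, NULL);
```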