llama_cpp 0.5.3 → 0.7.0

@@ -37,10 +37,12 @@

  #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

+ #define LLAMA_MAX_RNG_STATE (64*1024)
+
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
- #define LLAMA_SESSION_VERSION 1
+ #define LLAMA_SESSION_VERSION 2

  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
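
The raised LLAMA_SESSION_VERSION means session files written by 0.5.x will no longer load, and LLAMA_MAX_RNG_STATE bounds the RNG portion of serialized state. A minimal sketch of snapshotting context state with the state API reformatted later in this diff (llama_get_state_size is assumed to keep its existing declaration elsewhere in llama.h):

    // Sketch: snapshot the full context state (RNG, logits, embeddings, KV cache).
    #include <stdint.h>
    #include <stdlib.h>
    #include "llama.h"

    size_t snapshot_state(struct llama_context * ctx, uint8_t ** out_buf) {
        const size_t n_max = llama_get_state_size(ctx);         // upper bound, RNG state included
        uint8_t * buf = (uint8_t *) malloc(n_max);
        if (buf == NULL) return 0;
        const size_t n_used = llama_copy_state_data(ctx, buf);  // may be smaller than n_max
        *out_buf = buf;                                         // caller frees or writes to disk
        return n_used;
    }
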
@@ -60,13 +62,9 @@ extern "C" {
  struct llama_model;
  struct llama_context;

- typedef int llama_token;
-
- enum llama_log_level {
- LLAMA_LOG_LEVEL_ERROR = 2,
- LLAMA_LOG_LEVEL_WARN = 3,
- LLAMA_LOG_LEVEL_INFO = 4
- };
+ typedef int32_t llama_pos;
+ typedef int32_t llama_token;
+ typedef int32_t llama_seq_id;

  enum llama_vocab_type {
  LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@@ -86,24 +84,24 @@ extern "C" {
  // model file types
  enum llama_ftype {
  LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors

  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
  };
@@ -122,41 +120,68 @@ extern "C" {

  typedef void (*llama_progress_callback)(float progress, void *ctx);

- struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
-
+ // Input data for llama_decode
+ // A llama_batch object can contain input about one or many sequences
+ // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+ //
+ // - token : the token ids of the input (used when embd is NULL)
+ // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+ // - pos : the positions of the respective token in the sequence
+ // - seq_id : the sequence to which the respective token belongs
+ // - logits : if zero, the logits for the respective token will not be output
+ //
+ typedef struct llama_batch {
+ int32_t n_tokens;
+
+ llama_token * token;
+ float * embd;
+ llama_pos * pos;
+ llama_seq_id * seq_id;
+ int8_t * logits;
+
+ // NOTE: helpers for smooth API transition - can be deprecated in the future
+ // for future-proof code, use the above fields instead and ignore everything below
+ //
+ // pos[i] = all_pos_0 + i*all_pos_1
+ //
+ llama_pos all_pos_0; // used if pos == NULL
+ llama_pos all_pos_1; // used if pos == NULL
+ llama_seq_id all_seq_id; // used if seq_id == NULL
+ } llama_batch;
+
+ struct llama_model_params {
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
  const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

- // ref: https://github.com/ggerganov/llama.cpp/pull/2054
- float rope_freq_base; // RoPE base frequency
- float rope_freq_scale; // RoPE frequency scaling factor
-
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
  // context pointer passed to the progress callback
  void * progress_callback_user_data;

  // Keep the booleans together to avoid misalignment during copy-by-value.
- bool low_vram; // if true, reduce VRAM usage at the cost of performance
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels
- bool f16_kv; // use fp16 for KV cache
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
  bool use_mmap; // use mmap if possible
  bool use_mlock; // force system to keep model in RAM
- bool embedding; // embedding mode only
  };

- // Signature for logging events
- // Note that text includes the new line character at the end for most events.
- // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
- // if it exists.
- // It might not exist for progress report where '.' is output repeatedly.
- typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+ struct llama_context_params {
+ uint32_t seed; // RNG seed, -1 for random
+ uint32_t n_ctx; // text context, 0 = from model
+ uint32_t n_batch; // prompt processing maximum batch size
+ uint32_t n_threads; // number of threads to use for generation
+ uint32_t n_threads_batch; // number of threads to use for batch processing
+
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ float rope_freq_base; // RoPE base frequency, 0 = from model
+ float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+
+ // Keep the booleans together to avoid misalignment during copy-by-value.
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+ bool f16_kv; // use fp16 for KV cache, fp32 otherwise
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one
+ bool embedding; // embedding mode only
+ };

  // model quantization parameters
  typedef struct llama_model_quantize_params {
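
To make the new batch layout concrete, here is a sketch of filling one llama_batch by hand with two independent sequences, using llama_batch_init()/llama_batch_free() declared later in this diff; prompt_a/prompt_b and their lengths n_a/n_b are hypothetical placeholders.

    // Sketch: one batch carrying two sequences; logits requested only for each last token.
    struct llama_batch batch = llama_batch_init(/*n_tokens=*/n_a + n_b, /*embd=*/0);
    batch.n_tokens = 0;

    for (int i = 0; i < n_a; i++) {              // sequence 0
        const int k = batch.n_tokens++;
        batch.token [k] = prompt_a[i];
        batch.pos   [k] = i;                     // position within its own sequence
        batch.seq_id[k] = 0;
        batch.logits[k] = (i == n_a - 1);
    }
    for (int i = 0; i < n_b; i++) {              // sequence 1
        const int k = batch.n_tokens++;
        batch.token [k] = prompt_b[i];
        batch.pos   [k] = i;
        batch.seq_id[k] = 1;
        batch.logits[k] = (i == n_b - 1);
    }

    // ... llama_decode(ctx, batch), then read the two requested logits rows ...
    llama_batch_free(batch);
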
@@ -215,6 +240,8 @@ extern "C" {
  int32_t n_eval;
  };

+ // Helpers for getting default parameters
+ LLAMA_API struct llama_model_params llama_model_default_params(void);
  LLAMA_API struct llama_context_params llama_context_default_params(void);
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

@@ -228,7 +255,7 @@ extern "C" {

  LLAMA_API struct llama_model * llama_load_model_from_file(
  const char * path_model,
- struct llama_context_params params);
+ struct llama_model_params params);

  LLAMA_API void llama_free_model(struct llama_model * model);
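
Model loading now takes llama_model_params, with the per-context options moved to llama_context_params. A minimal sketch of the resulting two-step setup (the file name is a placeholder; llama_new_context_with_model is assumed to be the context constructor declared elsewhere in llama.h):

    // Sketch: split model/context creation under the 0.7.0-era API.
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 35;                      // model-level option

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) { /* handle load failure */ }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 4096;                 // 0 = take the value from the model
    cparams.n_threads       = 8;                    // generation threads
    cparams.n_threads_batch = 8;                    // prompt/batch threads

    // assumed companion constructor, declared elsewhere in llama.h
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
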
@@ -245,25 +272,31 @@ extern "C" {
  LLAMA_API bool llama_mmap_supported (void);
  LLAMA_API bool llama_mlock_supported(void);

- LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
+ LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
- LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
- LLAMA_API int llama_n_embd (const struct llama_context * ctx);

- LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+ LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
+
+ LLAMA_API int llama_n_vocab (const struct llama_model * model);
+ LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+ LLAMA_API int llama_n_embd (const struct llama_model * model);

- LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
- LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
- LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
- LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+ // Get the model's RoPE frequency scaling factor
+ LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

  // Get a string describing the model type
  LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
  // Returns the total size of all the tensors in the model in bytes
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
  // Returns the total number of parameters in the model
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

+ // Get a llama model tensor
+ LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
  // Returns 0 on success
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
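
Because the shape queries now take the model, code that only holds a context goes through llama_get_model() first. A short sketch using only the accessors above:

    // Sketch: query model properties starting from an existing context.
    #include <stdio.h>

    void print_model_info(struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx);

        char desc[128];
        llama_model_desc(model, desc, sizeof(desc));

        printf("%s: n_vocab=%d n_embd=%d n_ctx=%d (trained with %d), rope scale %.2f\n",
               desc,
               llama_n_vocab(model),
               llama_n_embd(model),
               llama_n_ctx(ctx),                    // still a per-context value
               llama_n_ctx_train(model),            // training-time context length
               llama_rope_freq_scale_train(model));
    }
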
@@ -279,21 +312,73 @@ extern "C" {
  LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
  struct llama_context * ctx,
  const char * path_lora,
+ float scale,
  const char * path_base_model,
  int n_threads),
- "please use llama_model_apply_lora_from_file instead");
+ "use llama_model_apply_lora_from_file instead");

  LLAMA_API int llama_model_apply_lora_from_file(
  const struct llama_model * model,
- const char * path_lora,
- const char * path_base_model,
- int n_threads);
+ const char * path_lora,
+ float scale,
+ const char * path_base_model,
+ int n_threads);
+
+ //
+ // KV cache
+ //

  // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+ LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+ "avoid using this, it will be removed in the future, instead - count the tokens in user code");

- // Sets the current rng seed.
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+ // Remove all tokens data of cells in [c0, c1)
+ // c0 < 0 : [0, c1]
+ // c1 < 0 : [c0, inf)
+ LLAMA_API void llama_kv_cache_tokens_rm(
+ struct llama_context * ctx,
+ int32_t c0,
+ int32_t c1);
+
+ // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+ // p0 < 0 : [0, p1]
+ // p1 < 0 : [p0, inf)
+ LLAMA_API void llama_kv_cache_seq_rm(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Copy all tokens that belong to the specified sequence to another sequence
+ // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+ // p0 < 0 : [0, p1]
+ // p1 < 0 : [p0, inf)
+ LLAMA_API void llama_kv_cache_seq_cp(
+ struct llama_context * ctx,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Removes all tokens that do not belong to the specified sequence
+ LLAMA_API void llama_kv_cache_seq_keep(
+ struct llama_context * ctx,
+ llama_seq_id seq_id);
+
+ // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+ // If the KV cache is RoPEd, the KV data is updated accordingly
+ // p0 < 0 : [0, p1]
+ // p1 < 0 : [p0, inf)
+ LLAMA_API void llama_kv_cache_seq_shift(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta);
+
+ //
+ // State / sessions
+ //

  // Returns the maximum size in bytes of the state (rng, logits, embedding
  // and kv_cache) - will often be smaller after compacting tokens
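
The sequence-aware cache calls above make "context shifting" possible: dropping the oldest tokens of a sequence and sliding the rest back without re-evaluating the prompt. A sketch with illustrative values:

    // Sketch: reclaim room in the KV cache for sequence 0.
    const llama_seq_id seq       = 0;
    const llama_pos    n_keep    = 64;     // e.g. a system prompt to preserve
    const llama_pos    n_discard = 256;    // amount of history to drop
    const llama_pos    n_past    = 2048;   // tokens currently cached for this sequence

    llama_kv_cache_seq_rm   (ctx, seq, n_keep,             n_keep + n_discard);
    llama_kv_cache_seq_shift(ctx, seq, n_keep + n_discard, n_past, -n_discard);

    // decoding then continues at position n_past - n_discard;
    // llama_kv_cache_seq_cp()/llama_kv_cache_seq_keep() cover forking and pruning sequences
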
@@ -302,48 +387,102 @@ extern "C" {
  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
  // Returns the number of bytes copied
- LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+ LLAMA_API size_t llama_copy_state_data(
+ struct llama_context * ctx,
+ uint8_t * dst);

  // Set the state reading from the specified address
  // Returns the number of bytes read
- LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+ LLAMA_API size_t llama_set_state_data(
+ struct llama_context * ctx,
+ uint8_t * src);

  // Save/load session file
- LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
- LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+ LLAMA_API bool llama_load_session_file(
+ struct llama_context * ctx,
+ const char * path_session,
+ llama_token * tokens_out,
+ size_t n_token_capacity,
+ size_t * n_token_count_out);

- // Run the llama inference to obtain the logits and probabilities for the next token.
+ LLAMA_API bool llama_save_session_file(
+ struct llama_context * ctx,
+ const char * path_session,
+ const llama_token * tokens,
+ size_t n_token_count);
+
+ //
+ // Decoding
+ //
+
+ // Run the llama inference to obtain the logits and probabilities for the next token(s).
  // tokens + n_tokens is the provided batch of new tokens to process
  // n_past is the number of tokens to use from previous eval calls
  // Returns 0 on success
- LLAMA_API int llama_eval(
+ // DEPRECATED: use llama_decode() instead
+ LLAMA_API DEPRECATED(int llama_eval(
  struct llama_context * ctx,
- const llama_token * tokens,
- int n_tokens,
- int n_past,
- int n_threads);
+ llama_token * tokens,
+ int32_t n_tokens,
+ int n_past),
+ "use llama_decode() instead");

  // Same as llama_eval, but use float matrix input directly.
- LLAMA_API int llama_eval_embd(
+ // DEPRECATED: use llama_decode() instead
+ LLAMA_API DEPRECATED(int llama_eval_embd(
  struct llama_context * ctx,
- const float * embd,
- int n_tokens,
- int n_past,
- int n_threads);
+ float * embd,
+ int32_t n_tokens,
+ int n_past),
+ "use llama_decode() instead");
+
+ // Return batch for single sequence of tokens starting at pos_0
+ //
+ // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+ //
+ LLAMA_API struct llama_batch llama_batch_get_one(
+ llama_token * tokens,
+ int32_t n_tokens,
+ llama_pos pos_0,
+ llama_seq_id seq_id);
+
+ // Allocates a batch of tokens on the heap
+ // The batch has to be freed with llama_batch_free()
+ // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+ // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+ // The rest of the llama_batch members are allocated with size n_tokens
+ // All members are left uninitialized
+ LLAMA_API struct llama_batch llama_batch_init(
+ int32_t n_tokens,
+ int32_t embd);
+
+ // Frees a batch of tokens allocated with llama_batch_init()
+ LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+ // Positive return values does not mean a fatal error, but rather a warning.
+ // 0 - success
+ // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+ // < 0 - error
+ LLAMA_API int llama_decode(
+ struct llama_context * ctx,
+ struct llama_batch batch);

- // Export a static computation graph for context of 511 and batch size of 1
- // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
- // parameters here to keep things simple
- // IMPORTANT: do not use for anything else other than debugging and testing!
- LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+ // Set the number of threads used for decoding
+ // n_threads is the number of threads used for generation (single token)
+ // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+ LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
- // Can be mutated in order to change the probabilities of the next token
- // Rows: n_tokens
+ // Logits for which llama_batch.logits[i] == 0 are undefined
+ // Rows: n_tokens provided with llama_batch
  // Cols: n_vocab
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);

+ // Logits for the ith token. Equivalent to:
+ // llama_get_logits(ctx) + i*n_vocab
+ LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
  // Get the embeddings for the input
  // shape: [n_embd] (1-dimensional)
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
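
Tying the new decoding entry points together, a greedy generation loop might look like the sketch below. prompt_tokens/n_prompt are assumed to come from llama_tokenize(), llama_token_data/llama_token_data_array are the candidate types defined elsewhere in llama.h, malloc/free need <stdlib.h>, and all counts are illustrative.

    // Sketch: evaluate a prompt, then extend it greedily with llama_decode().
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    llama_token_data * cand = (llama_token_data *) malloc(n_vocab * sizeof(llama_token_data));

    struct llama_batch batch = llama_batch_init(n_prompt, 0);
    batch.n_tokens = n_prompt;
    for (int i = 0; i < n_prompt; i++) {
        batch.token [i] = prompt_tokens[i];
        batch.pos   [i] = i;
        batch.seq_id[i] = 0;
        batch.logits[i] = (i == n_prompt - 1);   // only the last row is needed
    }

    int n_past   = n_prompt;
    int i_logits = n_prompt - 1;                 // row that holds defined logits

    for (int step = 0; step < 64 && llama_decode(ctx, batch) == 0; step++) {
        float * logits = llama_get_logits_ith(ctx, i_logits);

        for (llama_token t = 0; t < n_vocab; t++) {
            cand[t] = (llama_token_data){ t, logits[t], 0.0f };
        }
        llama_token_data_array cand_p = { cand, (size_t) n_vocab, false };

        const llama_token id = llama_sample_token_greedy(ctx, &cand_p);
        if (id == llama_token_eos(ctx)) break;

        // next iteration decodes the sampled token as a single-token batch
        batch.n_tokens = 1;
        batch.token[0] = id; batch.pos[0] = n_past++; batch.seq_id[0] = 0; batch.logits[0] = 1;
        i_logits = 0;
    }

    llama_batch_free(batch);
    free(cand);
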
@@ -362,6 +501,11 @@ extern "C" {
  LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
  LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
  LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+ // codellama infill tokens
+ LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
+ LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
+ LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
+ LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle

  //
  // Tokenization
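
The infill tokens target Code Llama style fill-in-the-middle prompts. A sketch of the usual prefix/suffix/middle layout (the exact format should be verified against the model in use; the token arrays and sizes are placeholders):

    // Sketch: assemble a fill-in-the-middle prompt (bounds checks omitted).
    llama_token prompt[1024];
    int n = 0;

    prompt[n++] = llama_token_prefix(ctx);
    for (int i = 0; i < n_prefix; i++) prompt[n++] = prefix_tokens[i];  // code before the cursor
    prompt[n++] = llama_token_suffix(ctx);
    for (int i = 0; i < n_suffix; i++) prompt[n++] = suffix_tokens[i];  // code after the cursor
    prompt[n++] = llama_token_middle(ctx);

    // decode `prompt`, then generate until llama_token_eot(ctx) is sampled
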
@@ -372,14 +516,6 @@ extern "C" {
  // Returns the number of tokens on success, no more than n_max_tokens
  // Returns a negative number on failure - the number of tokens that would have been returned
  LLAMA_API int llama_tokenize(
- struct llama_context * ctx,
- const char * text,
- int text_len,
- llama_token * tokens,
- int n_max_tokens,
- bool add_bos);
-
- LLAMA_API int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
  int text_len,
@@ -392,12 +528,6 @@ extern "C" {
  // Does not write null terminator to the buffer.
  // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
  LLAMA_API int llama_token_to_piece(
- const struct llama_context * ctx,
- llama_token token,
- char * buf,
- int length);
-
- LLAMA_API int llama_token_to_piece_with_model(
  const struct llama_model * model,
  llama_token token,
  char * buf,
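
Tokenization and detokenization are now model-level operations, so no context is required. A small sketch, assuming the remaining parameters keep the tokens/n_max_tokens/add_bos and buf/length tails of the removed context-based variants:

    // Sketch: tokenize a string and print each piece back.
    #include <stdio.h>
    #include <string.h>

    void dump_tokens(const struct llama_model * model, const char * text) {
        llama_token toks[128];
        const int n = llama_tokenize(model, text, (int) strlen(text), toks, 128, /*add_bos=*/true);
        if (n < 0) { /* buffer too small: -n tokens would have been needed */ return; }

        for (int i = 0; i < n; i++) {
            char piece[32];
            const int len = llama_token_to_piece(model, toks[i], piece, (int) sizeof(piece));
            if (len > 0) printf("%6d '%.*s'\n", toks[i], len, piece);
        }
    }
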
@@ -420,11 +550,25 @@ extern "C" {
  // Sampling functions
  //

+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+ LLAMA_API void llama_sample_repetition_penalty(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const llama_token * last_tokens,
+ size_t last_tokens_size,
+ float penalty);

  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const llama_token * last_tokens,
+ size_t last_tokens_size,
+ float alpha_frequency,
+ float alpha_presence);

  /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -437,23 +581,54 @@ extern "C" {
  float scale);

  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
- LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API void llama_sample_softmax(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+ LLAMA_API void llama_sample_top_k(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ int k,
+ size_t min_keep);

  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+ LLAMA_API void llama_sample_top_p(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float p,
+ size_t min_keep);

  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+ LLAMA_API void llama_sample_tail_free(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float z,
+ size_t min_keep);

  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
- LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
- LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+ LLAMA_API void llama_sample_typical(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float p,
+ size_t min_keep);
+
+ LLAMA_API void llama_sample_temp(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float temp);
+
+ LLAMA_API DEPRECATED(void llama_sample_temperature(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float temp),
+ "use llama_sample_temp instead");

  /// @details Apply constraints from grammar
- LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+ LLAMA_API void llama_sample_grammar(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const struct llama_grammar * grammar);

  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -461,23 +636,41 @@ extern "C" {
  /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
  /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
  /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
- LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+ LLAMA_API llama_token llama_sample_token_mirostat(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float tau,
+ float eta,
+ int m,
+ float * mu);

  /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
  /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
  /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
- LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float tau,
+ float eta,
+ float * mu);

  /// @details Selects the token with the highest probability.
- LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API llama_token llama_sample_token_greedy(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);

  /// @details Randomly selects a token from the candidates based on their probabilities.
- LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API llama_token llama_sample_token(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);

  /// @details Accepts the sampled token into the grammar
- LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+ LLAMA_API void llama_grammar_accept_token(
+ struct llama_context * ctx,
+ struct llama_grammar * grammar,
+ llama_token token);

  //
  // Beam search
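
The sampling helpers above keep their behavior; the signatures were only reflowed, and llama_sample_temp supersedes llama_sample_temperature. A sketch of the common penalty, top-k, top-p, temperature, sample chain, assuming cand_p is a llama_token_data_array built from the current logits (as in the decoding sketch earlier) and last_tokens/n_last track recently generated ids; the tuning values are illustrative:

    // Sketch: a typical sampling chain applied in place to cand_p.
    llama_sample_repetition_penalty(ctx, &cand_p, last_tokens, n_last, 1.1f);
    llama_sample_top_k (ctx, &cand_p, 40,    1);
    llama_sample_top_p (ctx, &cand_p, 0.95f, 1);
    llama_sample_temp  (ctx, &cand_p, 0.80f);                 // replaces llama_sample_temperature
    const llama_token id = llama_sample_token(ctx, &cand_p);  // draw from what remains
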
@@ -485,9 +678,10 @@ extern "C" {

  struct llama_beam_view {
  const llama_token * tokens;
+
  size_t n_tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Callback should set this to true when a beam is at end-of-beam.
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Callback should set this to true when a beam is at end-of-beam.
  };

  // Passed to beam_search_callback function.
@@ -496,9 +690,10 @@ extern "C" {
  // These pointers are valid only during the synchronous callback, so should not be saved.
  struct llama_beams_state {
  struct llama_beam_view * beam_views;
+
  size_t n_beams; // Number of elements in beam_views[].
  size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
- bool last_call; // True iff this is the last callback invocation.
+ bool last_call; // True iff this is the last callback invocation.
  };

  // Type of pointer to the beam_search_callback function.
@@ -513,11 +708,17 @@ extern "C" {
  /// @param n_beams Number of beams to use.
  /// @param n_past Number of tokens already evaluated.
  /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
- /// @param n_threads Number of threads as passed to llama_eval().
- LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+ LLAMA_API void llama_beam_search(
+ struct llama_context * ctx,
+ llama_beam_search_callback_fn_t callback,
+ void * callback_data,
+ size_t n_beams,
+ int n_past,
+ int n_predict);

  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);

@@ -526,7 +727,7 @@ extern "C" {

  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+ LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

  LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);