llama_cpp 0.5.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,6 +37,8 @@
 
  #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
+ #define LLAMA_MAX_RNG_STATE (64*1024)
+
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
@@ -60,13 +62,9 @@ extern "C" {
  struct llama_model;
  struct llama_context;
 
- typedef int llama_token;
-
- enum llama_log_level {
- LLAMA_LOG_LEVEL_ERROR = 2,
- LLAMA_LOG_LEVEL_WARN = 3,
- LLAMA_LOG_LEVEL_INFO = 4
- };
+ typedef int32_t llama_pos;
+ typedef int32_t llama_token;
+ typedef int32_t llama_seq_id;
 
  enum llama_vocab_type {
  LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@@ -86,24 +84,24 @@ extern "C" {
  // model file types
  enum llama_ftype {
  LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
 
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
  };
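
The hunk above only realigns the comments on the llama_ftype values; the values themselves do not change. For reference, these are the values one would place in llama_model_quantize_params.ftype together with the quantization entry points that appear later in this diff. A minimal sketch, assuming the quantize-params struct keeps its nthread and ftype fields (they are not shown in this diff) and using placeholder file names:

    #include "llama.h"

    // Quantize an F16 GGUF file down to Q4_K_M with default settings (sketch only).
    static int quantize_to_q4_k_m(void) {
        struct llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.nthread = 4;                         // assumed field, not part of this diff
        qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the enum values above

        // "model-f16.gguf" / "model-q4_k_m.gguf" are placeholder paths.
        return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
    }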
@@ -122,41 +120,68 @@ extern "C" {
 
  typedef void (*llama_progress_callback)(float progress, void *ctx);
 
- struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
-
+ // Input data for llama_decode
+ // A llama_batch object can contain input about one or many sequences
+ // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+ //
+ // - token : the token ids of the input (used when embd is NULL)
+ // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+ // - pos : the positions of the respective token in the sequence
+ // - seq_id : the sequence to which the respective token belongs
+ // - logits : if zero, the logits for the respective token will not be output
+ //
+ typedef struct llama_batch {
+ int32_t n_tokens;
+
+ llama_token * token;
+ float * embd;
+ llama_pos * pos;
+ llama_seq_id * seq_id;
+ int8_t * logits;
+
+ // NOTE: helpers for smooth API transition - can be deprecated in the future
+ // for future-proof code, use the above fields instead and ignore everything below
+ //
+ // pos[i] = all_pos_0 + i*all_pos_1
+ //
+ llama_pos all_pos_0; // used if pos == NULL
+ llama_pos all_pos_1; // used if pos == NULL
+ llama_seq_id all_seq_id; // used if seq_id == NULL
+ } llama_batch;
+
+ struct llama_model_params {
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
  const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
- // ref: https://github.com/ggerganov/llama.cpp/pull/2054
- float rope_freq_base; // RoPE base frequency
- float rope_freq_scale; // RoPE frequency scaling factor
-
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
  // context pointer passed to the progress callback
  void * progress_callback_user_data;
 
  // Keep the booleans together to avoid misalignment during copy-by-value.
- bool low_vram; // if true, reduce VRAM usage at the cost of performance
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels
- bool f16_kv; // use fp16 for KV cache
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
  bool use_mmap; // use mmap if possible
  bool use_mlock; // force system to keep model in RAM
- bool embedding; // embedding mode only
  };
 
- // Signature for logging events
- // Note that text includes the new line character at the end for most events.
- // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
- // if it exists.
- // It might not exist for progress report where '.' is output repeatedly.
- typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+ struct llama_context_params {
+ uint32_t seed; // RNG seed, -1 for random
+ uint32_t n_ctx; // text context
+ uint32_t n_batch; // prompt processing batch size
+ uint32_t n_threads; // number of threads to use for generation
+ uint32_t n_threads_batch; // number of threads to use for batch processing
+
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ float rope_freq_base; // RoPE base frequency
+ float rope_freq_scale; // RoPE frequency scaling factor
+
+ // Keep the booleans together to avoid misalignment during copy-by-value.
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+ bool f16_kv; // use fp16 for KV cache
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one
+ bool embedding; // embedding mode only
+ };
 
  // model quantization parameters
  typedef struct llama_model_quantize_params {
@@ -215,6 +240,8 @@ extern "C" {
  int32_t n_eval;
  };
 
+ // Helpers for getting default parameters
+ LLAMA_API struct llama_model_params llama_model_default_params(void);
  LLAMA_API struct llama_context_params llama_context_default_params(void);
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
@@ -228,7 +255,7 @@ extern "C" {
 
  LLAMA_API struct llama_model * llama_load_model_from_file(
  const char * path_model,
- struct llama_context_params params);
+ struct llama_model_params params);
 
  LLAMA_API void llama_free_model(struct llama_model * model);
 
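
llama_load_model_from_file now takes the new llama_model_params, so model-level options (GPU layers, mmap, progress callback) are set separately from per-context options (context size, threads, RoPE, seed). A rough sketch of the resulting call sequence; llama_new_context_with_model is not part of this diff and is assumed unchanged, and the model path is a placeholder:

    #include <stddef.h>
    #include "llama.h"

    static struct llama_context * open_context(const char * path /* e.g. "model.gguf" */) {
        struct llama_model_params   mparams = llama_model_default_params();
        struct llama_context_params cparams = llama_context_default_params();

        mparams.n_gpu_layers    = 35;   // model-level knob (previously lived in llama_context_params)
        cparams.n_ctx           = 2048; // context-level knobs
        cparams.n_threads       = 8;
        cparams.n_threads_batch = 8;

        struct llama_model * model = llama_load_model_from_file(path, mparams);
        if (model == NULL) {
            return NULL;
        }

        // Assumed unchanged by this diff.
        return llama_new_context_with_model(model, cparams);
    }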
@@ -245,25 +272,28 @@ extern "C" {
  LLAMA_API bool llama_mmap_supported (void);
  LLAMA_API bool llama_mlock_supported(void);
 
- LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
+ LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
- LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
- LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
- LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+ LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
- LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
- LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
- LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
- LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+ LLAMA_API int llama_n_vocab (const struct llama_model * model);
+ LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+ LLAMA_API int llama_n_embd (const struct llama_model * model);
 
  // Get a string describing the model type
  LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
  // Returns the total size of all the tensors in the model in bytes
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
  // Returns the total number of parameters in the model
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+ // Get a llama model tensor
+ LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
  // Returns 0 on success
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
@@ -279,21 +309,65 @@ extern "C" {
  LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
  struct llama_context * ctx,
  const char * path_lora,
+ float scale,
  const char * path_base_model,
  int n_threads),
- "please use llama_model_apply_lora_from_file instead");
+ "use llama_model_apply_lora_from_file instead");
 
  LLAMA_API int llama_model_apply_lora_from_file(
  const struct llama_model * model,
- const char * path_lora,
- const char * path_base_model,
- int n_threads);
+ const char * path_lora,
+ float scale,
+ const char * path_base_model,
+ int n_threads);
+
+ //
+ // KV cache
+ //
 
  // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+ LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+ "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
- // Sets the current rng seed.
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+ // Remove all tokens data of cells in [c0, c1)
+ LLAMA_API void llama_kv_cache_tokens_rm(
+ struct llama_context * ctx,
+ int32_t c0,
+ int32_t c1);
+
+ // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+ LLAMA_API void llama_kv_cache_seq_rm(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Copy all tokens that belong to the specified sequence to another sequence
+ // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+ LLAMA_API void llama_kv_cache_seq_cp(
+ struct llama_context * ctx,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Removes all tokens that do not belong to the specified sequence
+ LLAMA_API void llama_kv_cache_seq_keep(
+ struct llama_context * ctx,
+ llama_seq_id seq_id);
+
+ // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+ // If the KV cache is RoPEd, the KV data is updated accordingly
+ LLAMA_API void llama_kv_cache_seq_shift(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta);
+
+ //
+ // State / sessions
+ //
 
  // Returns the maximum size in bytes of the state (rng, logits, embedding
  // and kv_cache) - will often be smaller after compacting tokens
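
The new KV-cache calls above operate on half-open position ranges within a sequence, which is what makes context shifting possible without re-evaluating the prompt. A hedged sketch of such a shift for sequence 0, where n_keep, n_discard and n_past are bookkeeping values the caller already maintains:

    #include "llama.h"

    // Keep the first n_keep tokens, drop the next n_discard, shift the remainder left.
    static void context_shift(struct llama_context * ctx, llama_pos n_keep,
                              llama_pos n_discard, llama_pos n_past) {
        const llama_seq_id seq = 0;
        llama_kv_cache_seq_rm   (ctx, seq, n_keep, n_keep + n_discard);
        llama_kv_cache_seq_shift(ctx, seq, n_keep + n_discard, n_past, -n_discard);
        // Subsequent tokens for this sequence should be submitted with positions
        // continuing from n_past - n_discard.
    }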
@@ -302,48 +376,102 @@ extern "C" {
  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
  // Returns the number of bytes copied
- LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+ LLAMA_API size_t llama_copy_state_data(
+ struct llama_context * ctx,
+ uint8_t * dst);
 
  // Set the state reading from the specified address
  // Returns the number of bytes read
- LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+ LLAMA_API size_t llama_set_state_data(
+ struct llama_context * ctx,
+ uint8_t * src);
 
  // Save/load session file
- LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
- LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+ LLAMA_API bool llama_load_session_file(
+ struct llama_context * ctx,
+ const char * path_session,
+ llama_token * tokens_out,
+ size_t n_token_capacity,
+ size_t * n_token_count_out);
+
+ LLAMA_API bool llama_save_session_file(
+ struct llama_context * ctx,
+ const char * path_session,
+ const llama_token * tokens,
+ size_t n_token_count);
+
+ //
+ // Decoding
+ //
 
- // Run the llama inference to obtain the logits and probabilities for the next token.
+ // Run the llama inference to obtain the logits and probabilities for the next token(s).
  // tokens + n_tokens is the provided batch of new tokens to process
  // n_past is the number of tokens to use from previous eval calls
  // Returns 0 on success
- LLAMA_API int llama_eval(
+ // DEPRECATED: use llama_decode() instead
+ LLAMA_API DEPRECATED(int llama_eval(
  struct llama_context * ctx,
- const llama_token * tokens,
- int n_tokens,
- int n_past,
- int n_threads);
+ llama_token * tokens,
+ int32_t n_tokens,
+ int n_past),
+ "use llama_decode() instead");
 
  // Same as llama_eval, but use float matrix input directly.
- LLAMA_API int llama_eval_embd(
+ // DEPRECATED: use llama_decode() instead
+ LLAMA_API DEPRECATED(int llama_eval_embd(
  struct llama_context * ctx,
- const float * embd,
- int n_tokens,
- int n_past,
- int n_threads);
+ float * embd,
+ int32_t n_tokens,
+ int n_past),
+ "use llama_decode() instead");
 
- // Export a static computation graph for context of 511 and batch size of 1
- // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
- // parameters here to keep things simple
- // IMPORTANT: do not use for anything else other than debugging and testing!
- LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+ // Return batch for single sequence of tokens starting at pos_0
+ //
+ // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+ //
+ LLAMA_API struct llama_batch llama_batch_get_one(
+ llama_token * tokens,
+ int32_t n_tokens,
+ llama_pos pos_0,
+ llama_seq_id seq_id);
+
+ // Allocates a batch of tokens on the heap
+ // The batch has to be freed with llama_batch_free()
+ // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+ // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+ // The rest of the llama_batch members are allocated with size n_tokens
+ // All members are left uninitialized
+ LLAMA_API struct llama_batch llama_batch_init(
+ int32_t n_tokens,
+ int32_t embd);
+
+ // Frees a batch of tokens allocated with llama_batch_init()
+ LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+ // Positive return values does not mean a fatal error, but rather a warning.
+ // 0 - success
+ // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+ // < 0 - error
+ LLAMA_API int llama_decode(
+ struct llama_context * ctx,
+ struct llama_batch batch);
+
+ // Set the number of threads used for decoding
+ // n_threads is the number of threads used for generation (single token)
+ // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+ LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
- // Can be mutated in order to change the probabilities of the next token
- // Rows: n_tokens
+ // Logits for which llama_batch.logits[i] == 0 are undefined
+ // Rows: n_tokens provided with llama_batch
  // Cols: n_vocab
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
+ // Logits for the ith token. Equivalent to:
+ // llama_get_logits(ctx) + i*n_vocab
+ LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
  // Get the embeddings for the input
  // shape: [n_embd] (1-dimensional)
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
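
To tie the batch API together: a rough generation-loop skeleton that fills the llama_batch struct introduced earlier with a single sequence, requests logits only where needed, and reads them back with llama_get_logits_ith. Greedy argmax stands in for the sampling functions, and error handling is minimal; treat it as a sketch rather than the reference usage.

    #include "llama.h"

    // Evaluate a prompt, then generate n_predict more tokens (sketch only).
    static void generate(struct llama_context * ctx, const llama_token * prompt,
                         int32_t n_prompt, int n_predict) {
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));

        // Batch for the prompt: sequence 0, positions 0..n_prompt-1,
        // logits requested only for the last prompt token.
        struct llama_batch batch = llama_batch_init(n_prompt, 0);
        batch.n_tokens = n_prompt;
        for (int32_t i = 0; i < n_prompt; ++i) {
            batch.token [i] = prompt[i];
            batch.pos   [i] = i;
            batch.seq_id[i] = 0;
            batch.logits[i] = (i == n_prompt - 1);
        }
        if (llama_decode(ctx, batch) != 0) { llama_batch_free(batch); return; }

        llama_pos n_past = n_prompt;
        int32_t   i_last = n_prompt - 1; // row that holds the logits we asked for

        for (int n = 0; n < n_predict; ++n) {
            const float * logits = llama_get_logits_ith(ctx, i_last);

            llama_token best = 0; // greedy pick; real code would use the sampling API
            for (int t = 1; t < n_vocab; ++t) {
                if (logits[t] > logits[best]) best = t;
            }
            // (a real program would collect or print `best` here)

            // Re-use the batch for the single new token.
            batch.n_tokens  = 1;
            batch.token [0] = best;
            batch.pos   [0] = n_past++;
            batch.seq_id[0] = 0;
            batch.logits[0] = 1;
            i_last = 0;

            if (llama_decode(ctx, batch) != 0) break;
        }

        llama_batch_free(batch);
    }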
@@ -372,14 +500,6 @@ extern "C" {
  // Returns the number of tokens on success, no more than n_max_tokens
  // Returns a negative number on failure - the number of tokens that would have been returned
  LLAMA_API int llama_tokenize(
- struct llama_context * ctx,
- const char * text,
- int text_len,
- llama_token * tokens,
- int n_max_tokens,
- bool add_bos);
-
- LLAMA_API int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
  int text_len,
@@ -392,12 +512,6 @@ extern "C" {
  // Does not write null terminator to the buffer.
  // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
  LLAMA_API int llama_token_to_piece(
- const struct llama_context * ctx,
- llama_token token,
- char * buf,
- int length);
-
- LLAMA_API int llama_token_to_piece_with_model(
  const struct llama_model * model,
  llama_token token,
  char * buf,
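
Both llama_tokenize and llama_token_to_piece now take the model instead of a context, and the *_with_model variants are removed. A hedged sketch of the round trip, relying on the negative-return conventions described in the two hunks above; the trailing parameters not visible in these hunks are assumed unchanged:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "llama.h"

    // Tokenize text, growing the buffer when the first call reports (as a negative
    // value) how many tokens it would have produced, then print each token back.
    static void tokenize_and_print(const struct llama_model * model, const char * text, bool add_bos) {
        int cap = 32;
        llama_token * toks = malloc((size_t) cap * sizeof(llama_token));

        int n = llama_tokenize(model, text, (int) strlen(text), toks, cap, add_bos);
        if (n < 0) {
            cap  = -n;
            toks = realloc(toks, (size_t) cap * sizeof(llama_token));
            n    = llama_tokenize(model, text, (int) strlen(text), toks, cap, add_bos);
        }

        for (int i = 0; i < n; ++i) {
            char buf[64];
            const int len = llama_token_to_piece(model, toks[i], buf, (int) sizeof(buf));
            if (len >= 0) {
                fwrite(buf, 1, (size_t) len, stdout); // no null terminator is written
            }
        }
        free(toks);
    }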
@@ -420,11 +534,25 @@ extern "C" {
  // Sampling functions
  //
 
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+ LLAMA_API void llama_sample_repetition_penalty(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const llama_token * last_tokens,
+ size_t last_tokens_size,
+ float penalty);
 
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const llama_token * last_tokens,
+ size_t last_tokens_size,
+ float alpha_frequency,
+ float alpha_presence);
 
  /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -437,23 +565,54 @@ extern "C" {
  float scale);
 
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
- LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API void llama_sample_softmax(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);
 
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+ LLAMA_API void llama_sample_top_k(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ int k,
+ size_t min_keep);
 
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+ LLAMA_API void llama_sample_top_p(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float p,
+ size_t min_keep);
 
  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+ LLAMA_API void llama_sample_tail_free(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float z,
+ size_t min_keep);
 
  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
- LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
- LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+ LLAMA_API void llama_sample_typical(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float p,
+ size_t min_keep);
+
+ LLAMA_API void llama_sample_temp(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float temp);
+
+ LLAMA_API DEPRECATED(void llama_sample_temperature(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float temp),
+ "use llama_sample_temp instead");
 
  /// @details Apply constraints from grammar
- LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+ LLAMA_API void llama_sample_grammar(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const struct llama_grammar * grammar);
 
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -461,23 +620,41 @@ extern "C" {
  /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
  /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
  /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
- LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+ LLAMA_API llama_token llama_sample_token_mirostat(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float tau,
+ float eta,
+ int m,
+ float * mu);
 
  /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
  /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
  /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
- LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float tau,
+ float eta,
+ float * mu);
 
  /// @details Selects the token with the highest probability.
- LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API llama_token llama_sample_token_greedy(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);
 
  /// @details Randomly selects a token from the candidates based on their probabilities.
- LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API llama_token llama_sample_token(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);
 
  /// @details Accepts the sampled token into the grammar
- LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+ LLAMA_API void llama_grammar_accept_token(
+ struct llama_context * ctx,
+ struct llama_grammar * grammar,
+ llama_token token);
 
  //
  // Beam search
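
Before the beam-search section: the functional change in the sampling hunks above is that llama_sample_temperature is deprecated in favor of llama_sample_temp; everything else is reformatting. A sketch of a common top-k/top-p/temperature chain feeding llama_sample_token, assuming llama_token_data and llama_token_data_array keep their existing layout (id/logit/p and data/size/sorted), which this diff does not touch:

    #include <stdbool.h>
    #include <stdlib.h>
    #include "llama.h"

    // Sample the next token from the logits of row i of the last llama_decode call
    // (row i must have had llama_batch.logits[i] != 0).
    static llama_token sample_next(struct llama_context * ctx, int32_t i) {
        const int     n_vocab = llama_n_vocab(llama_get_model(ctx));
        const float * logits  = llama_get_logits_ith(ctx, i);

        llama_token_data * data = malloc((size_t) n_vocab * sizeof(llama_token_data));
        for (int t = 0; t < n_vocab; ++t) {
            data[t].id    = t;
            data[t].logit = logits[t];
            data[t].p     = 0.0f;
        }
        llama_token_data_array candidates = { data, (size_t) n_vocab, false };

        llama_sample_top_k(ctx, &candidates, 40, 1);
        llama_sample_top_p(ctx, &candidates, 0.95f, 1);
        llama_sample_temp (ctx, &candidates, 0.80f); // was llama_sample_temperature

        // Alternatives: llama_sample_token_greedy, or the mirostat samplers with
        // a caller-maintained mu initialized to 2 * tau.
        llama_token tok = llama_sample_token(ctx, &candidates);
        free(data);
        return tok;
    }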
@@ -485,9 +662,10 @@ extern "C" {
 
  struct llama_beam_view {
  const llama_token * tokens;
+
  size_t n_tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Callback should set this to true when a beam is at end-of-beam.
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Callback should set this to true when a beam is at end-of-beam.
  };
 
  // Passed to beam_search_callback function.
@@ -496,9 +674,10 @@ extern "C" {
  // These pointers are valid only during the synchronous callback, so should not be saved.
  struct llama_beams_state {
  struct llama_beam_view * beam_views;
+
  size_t n_beams; // Number of elements in beam_views[].
  size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
- bool last_call; // True iff this is the last callback invocation.
+ bool last_call; // True iff this is the last callback invocation.
  };
 
  // Type of pointer to the beam_search_callback function.
@@ -513,11 +692,17 @@ extern "C" {
  /// @param n_beams Number of beams to use.
  /// @param n_past Number of tokens already evaluated.
  /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
- /// @param n_threads Number of threads as passed to llama_eval().
- LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+ LLAMA_API void llama_beam_search(
+ struct llama_context * ctx,
+ llama_beam_search_callback_fn_t callback,
+ void * callback_data,
+ size_t n_beams,
+ int n_past,
+ int n_predict);
 
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);
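
llama_beam_search drops its n_threads argument (thread counts now live in the context). A hedged sketch of a callback for it, assuming the llama_beam_search_callback_fn_t typedef elsewhere in the header takes (void * callback_data, struct llama_beams_state) and using only the struct fields shown in the hunks above:

    #include <stdio.h>
    #include "llama.h"

    // Report the beams when the search makes its final callback.
    // Assumed callback shape: (void * callback_data, struct llama_beams_state).
    static void my_beam_cb(void * callback_data, struct llama_beams_state state) {
        (void) callback_data;
        if (!state.last_call) {
            return;
        }
        for (size_t i = 0; i < state.n_beams; ++i) {
            const struct llama_beam_view * bv = &state.beam_views[i];
            fprintf(stderr, "beam %zu: %zu tokens, p = %f\n", i, bv->n_tokens, bv->p);
        }
    }

    // Example call (n_past comes from the caller's decoding state):
    //     llama_beam_search(ctx, my_beam_cb, NULL, /*n_beams=*/4, n_past, /*n_predict=*/64);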
523
708
 
@@ -526,7 +711,7 @@ extern "C" {
 
  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+ LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
  LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
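
With llama_log_level removed from llama.h, llama_log_set now takes ggml's callback type. A sketch that routes all log text to stderr, assuming ggml_log_callback keeps the (level, text, user_data) shape that the removed llama_log_callback had:

    #include <stdio.h>
    #include "llama.h" // pulls in ggml.h, which provides ggml_log_callback

    // Forward llama.cpp log output to stderr, ignoring the level.
    static void log_to_stderr(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr); // the text usually already ends with '\n'
    }

    // Somewhere during startup:
    //     llama_log_set(log_to_stderr, NULL);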
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.5.3'
+ VERSION = '0.6.0'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1266'
+ LLAMA_CPP_VERSION = 'b1292'
  end