llama_cpp 0.5.2 → 0.6.0

@@ -37,6 +37,8 @@
 
  #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
+ #define LLAMA_MAX_RNG_STATE (64*1024)
+
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
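For illustration, a minimal sketch of how a context snapshot can be taken and restored with the state calls that appear later in this diff; the new LLAMA_MAX_RNG_STATE constant bounds the serialized RNG portion of such a snapshot. llama_get_state_size is assumed from the unchanged part of llama.h (its doc comment is visible further down).

    #include <stdint.h>
    #include <stdlib.h>
    #include "llama.h"

    // Sketch, not from the diff: snapshot a context into a heap buffer and
    // restore it later. llama_get_state_size is assumed from the unchanged
    // part of the header; the RNG portion of the buffer is bounded by
    // LLAMA_MAX_RNG_STATE above.
    static uint8_t * snapshot_state(struct llama_context * ctx, size_t * n_out) {
        uint8_t * buf = (uint8_t *) malloc(llama_get_state_size(ctx));
        *n_out = llama_copy_state_data(ctx, buf);   // actual number of bytes written
        return buf;                                 // caller frees
    }

    static void restore_state(struct llama_context * ctx, uint8_t * buf) {
        llama_set_state_data(ctx, buf);
    }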
@@ -60,13 +62,9 @@ extern "C" {
  struct llama_model;
  struct llama_context;
 
- typedef int llama_token;
-
- enum llama_log_level {
- LLAMA_LOG_LEVEL_ERROR = 2,
- LLAMA_LOG_LEVEL_WARN = 3,
- LLAMA_LOG_LEVEL_INFO = 4
- };
+ typedef int32_t llama_pos;
+ typedef int32_t llama_token;
+ typedef int32_t llama_seq_id;
 
  enum llama_vocab_type {
  LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@@ -86,24 +84,24 @@ extern "C" {
  // model file types
  enum llama_ftype {
  LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
- // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
 
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
  };
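The llama_ftype values select the target quantization when converting a model. A rough sketch of passing one of them to llama_model_quantize (declared further down); the ftype and nthread fields are assumed members of llama_model_quantize_params, which is only partially shown in this diff, and the file names are placeholders.

    #include "llama.h"

    // Sketch, not from the diff: quantize an F16 model to Q4_K_M. The params
    // fields used here (nthread, ftype) are assumed from
    // llama_model_quantize_params; the file names are placeholders.
    static int quantize_q4_k_m(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.nthread = 8;                          // assumed field: worker threads
        params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // one of the values above

        return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    }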
@@ -122,41 +120,68 @@ extern "C" {
 
  typedef void (*llama_progress_callback)(float progress, void *ctx);
 
- struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
-
+ // Input data for llama_decode
+ // A llama_batch object can contain input about one or many sequences
+ // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+ //
+ // - token : the token ids of the input (used when embd is NULL)
+ // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+ // - pos : the positions of the respective token in the sequence
+ // - seq_id : the sequence to which the respective token belongs
+ // - logits : if zero, the logits for the respective token will not be output
+ //
+ typedef struct llama_batch {
+ int32_t n_tokens;
+
+ llama_token * token;
+ float * embd;
+ llama_pos * pos;
+ llama_seq_id * seq_id;
+ int8_t * logits;
+
+ // NOTE: helpers for smooth API transition - can be deprecated in the future
+ // for future-proof code, use the above fields instead and ignore everything below
+ //
+ // pos[i] = all_pos_0 + i*all_pos_1
+ //
+ llama_pos all_pos_0; // used if pos == NULL
+ llama_pos all_pos_1; // used if pos == NULL
+ llama_seq_id all_seq_id; // used if seq_id == NULL
+ } llama_batch;
+
+ struct llama_model_params {
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
  const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
- // ref: https://github.com/ggerganov/llama.cpp/pull/2054
- float rope_freq_base; // RoPE base frequency
- float rope_freq_scale; // RoPE frequency scaling factor
-
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
  // context pointer passed to the progress callback
  void * progress_callback_user_data;
 
  // Keep the booleans together to avoid misalignment during copy-by-value.
- bool low_vram; // if true, reduce VRAM usage at the cost of performance
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels
- bool f16_kv; // use fp16 for KV cache
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
  bool use_mmap; // use mmap if possible
  bool use_mlock; // force system to keep model in RAM
- bool embedding; // embedding mode only
  };
 
- // Signature for logging events
- // Note that text includes the new line character at the end for most events.
- // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
- // if it exists.
- // It might not exist for progress report where '.' is output repeatedly.
- typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+ struct llama_context_params {
+ uint32_t seed; // RNG seed, -1 for random
+ uint32_t n_ctx; // text context
+ uint32_t n_batch; // prompt processing batch size
+ uint32_t n_threads; // number of threads to use for generation
+ uint32_t n_threads_batch; // number of threads to use for batch processing
+
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ float rope_freq_base; // RoPE base frequency
+ float rope_freq_scale; // RoPE frequency scaling factor
+
+ // Keep the booleans together to avoid misalignment during copy-by-value.
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+ bool f16_kv; // use fp16 for KV cache
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one
+ bool embedding; // embedding mode only
+ };
 
  // model quantization parameters
  typedef struct llama_model_quantize_params {
@@ -215,6 +240,8 @@ extern "C" {
  int32_t n_eval;
  };
 
+ // Helpers for getting default parameters
+ LLAMA_API struct llama_model_params llama_model_default_params(void);
  LLAMA_API struct llama_context_params llama_context_default_params(void);
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
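With model options and context options now split, setup follows a two-step pattern. A minimal sketch, assuming llama_new_context_with_model from the unchanged part of llama.h and a placeholder model path:

    #include <stddef.h>
    #include "llama.h"

    // Sketch, not from the diff: load a model with llama_model_params, then
    // create a context with llama_context_params. llama_new_context_with_model
    // is assumed from the unchanged part of the header.
    static struct llama_context * open_context(const char * path_model /* placeholder */) {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0;                 // layers to keep in VRAM

        struct llama_model * model = llama_load_model_from_file(path_model, mparams);
        if (model == NULL) {
            return NULL;
        }

        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 2048;           // text context
        cparams.n_threads       = 4;              // generation threads
        cparams.n_threads_batch = 4;              // prompt/batch threads

        return llama_new_context_with_model(model, cparams);
    }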
@@ -228,7 +255,7 @@ extern "C" {
 
  LLAMA_API struct llama_model * llama_load_model_from_file(
  const char * path_model,
- struct llama_context_params params);
+ struct llama_model_params params);
 
  LLAMA_API void llama_free_model(struct llama_model * model);
 
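Once a model is loaded, the model-based getters in the next hunk can be used to inspect it. A small sketch:

    #include <stdio.h>
    #include "llama.h"

    // Sketch, not from the diff: print basic metadata for a loaded model,
    // using the model-based getters declared in the next hunk.
    static void print_model_info(const struct llama_model * model) {
        char desc[128];
        llama_model_desc(model, desc, sizeof(desc));

        printf("%s\n", desc);
        printf("n_vocab     = %d\n", llama_n_vocab(model));
        printf("n_ctx_train = %d\n", llama_n_ctx_train(model));
        printf("n_embd      = %d\n", llama_n_embd(model));
        printf("n_params    = %llu\n", (unsigned long long) llama_model_n_params(model));
    }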
@@ -245,25 +272,28 @@ extern "C" {
  LLAMA_API bool llama_mmap_supported (void);
  LLAMA_API bool llama_mlock_supported(void);
 
- LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
+ LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
- LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
- LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
- LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+ LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
- LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
- LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
- LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
- LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+ LLAMA_API int llama_n_vocab (const struct llama_model * model);
+ LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+ LLAMA_API int llama_n_embd (const struct llama_model * model);
 
  // Get a string describing the model type
  LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
  // Returns the total size of all the tensors in the model in bytes
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
  // Returns the total number of parameters in the model
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+ // Get a llama model tensor
+ LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
  // Returns 0 on success
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
@@ -279,21 +309,65 @@ extern "C" {
  LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
  struct llama_context * ctx,
  const char * path_lora,
+ float scale,
  const char * path_base_model,
  int n_threads),
- "please use llama_model_apply_lora_from_file instead");
+ "use llama_model_apply_lora_from_file instead");
 
  LLAMA_API int llama_model_apply_lora_from_file(
  const struct llama_model * model,
- const char * path_lora,
- const char * path_base_model,
- int n_threads);
+ const char * path_lora,
+ float scale,
+ const char * path_base_model,
+ int n_threads);
+
+ //
+ // KV cache
+ //
 
  // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+ LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+ "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
- // Sets the current rng seed.
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+ // Remove all tokens data of cells in [c0, c1)
+ LLAMA_API void llama_kv_cache_tokens_rm(
+ struct llama_context * ctx,
+ int32_t c0,
+ int32_t c1);
+
+ // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+ LLAMA_API void llama_kv_cache_seq_rm(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Copy all tokens that belong to the specified sequence to another sequence
+ // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+ LLAMA_API void llama_kv_cache_seq_cp(
+ struct llama_context * ctx,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1);
+
+ // Removes all tokens that do not belong to the specified sequence
+ LLAMA_API void llama_kv_cache_seq_keep(
+ struct llama_context * ctx,
+ llama_seq_id seq_id);
+
+ // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+ // If the KV cache is RoPEd, the KV data is updated accordingly
+ LLAMA_API void llama_kv_cache_seq_shift(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta);
+
+ //
+ // State / sessions
+ //
 
  // Returns the maximum size in bytes of the state (rng, logits, embedding
  // and kv_cache) - will often be smaller after compacting tokens
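The sequence-aware KV-cache calls make the usual context-shifting trick expressible directly: keep the first n_keep cached tokens of a sequence, drop a chunk of the older generated ones, and slide the rest back. A rough sketch:

    #include "llama.h"

    // Sketch, not from the diff: free room in a full context by discarding the
    // oldest half of the non-protected tokens of sequence `seq`, then shifting
    // the surviving tokens left so their positions stay contiguous.
    static void shift_context(struct llama_context * ctx, llama_seq_id seq,
                              llama_pos n_past, llama_pos n_keep) {
        const llama_pos n_discard = (n_past - n_keep) / 2;

        // drop tokens with positions [n_keep, n_keep + n_discard)
        llama_kv_cache_seq_rm   (ctx, seq, n_keep, n_keep + n_discard);

        // move tokens [n_keep + n_discard, n_past) back by n_discard positions
        llama_kv_cache_seq_shift(ctx, seq, n_keep + n_discard, n_past, -n_discard);
    }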
@@ -302,48 +376,102 @@ extern "C" {
  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
  // Returns the number of bytes copied
- LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+ LLAMA_API size_t llama_copy_state_data(
+ struct llama_context * ctx,
+ uint8_t * dst);
 
  // Set the state reading from the specified address
  // Returns the number of bytes read
- LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+ LLAMA_API size_t llama_set_state_data(
+ struct llama_context * ctx,
+ uint8_t * src);
 
  // Save/load session file
- LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
- LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+ LLAMA_API bool llama_load_session_file(
+ struct llama_context * ctx,
+ const char * path_session,
+ llama_token * tokens_out,
+ size_t n_token_capacity,
+ size_t * n_token_count_out);
 
- // Run the llama inference to obtain the logits and probabilities for the next token.
+ LLAMA_API bool llama_save_session_file(
+ struct llama_context * ctx,
+ const char * path_session,
+ const llama_token * tokens,
+ size_t n_token_count);
+
+ //
+ // Decoding
+ //
+
+ // Run the llama inference to obtain the logits and probabilities for the next token(s).
  // tokens + n_tokens is the provided batch of new tokens to process
  // n_past is the number of tokens to use from previous eval calls
  // Returns 0 on success
- LLAMA_API int llama_eval(
+ // DEPRECATED: use llama_decode() instead
+ LLAMA_API DEPRECATED(int llama_eval(
  struct llama_context * ctx,
- const llama_token * tokens,
- int n_tokens,
- int n_past,
- int n_threads);
+ llama_token * tokens,
+ int32_t n_tokens,
+ int n_past),
+ "use llama_decode() instead");
 
  // Same as llama_eval, but use float matrix input directly.
- LLAMA_API int llama_eval_embd(
+ // DEPRECATED: use llama_decode() instead
+ LLAMA_API DEPRECATED(int llama_eval_embd(
  struct llama_context * ctx,
- const float * embd,
- int n_tokens,
- int n_past,
- int n_threads);
+ float * embd,
+ int32_t n_tokens,
+ int n_past),
+ "use llama_decode() instead");
 
- // Export a static computation graph for context of 511 and batch size of 1
- // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
- // parameters here to keep things simple
- // IMPORTANT: do not use for anything else other than debugging and testing!
- LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+ // Return batch for single sequence of tokens starting at pos_0
+ //
+ // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+ //
+ LLAMA_API struct llama_batch llama_batch_get_one(
+ llama_token * tokens,
+ int32_t n_tokens,
+ llama_pos pos_0,
+ llama_seq_id seq_id);
+
+ // Allocates a batch of tokens on the heap
+ // The batch has to be freed with llama_batch_free()
+ // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+ // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+ // The rest of the llama_batch members are allocated with size n_tokens
+ // All members are left uninitialized
+ LLAMA_API struct llama_batch llama_batch_init(
+ int32_t n_tokens,
+ int32_t embd);
+
+ // Frees a batch of tokens allocated with llama_batch_init()
+ LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+ // Positive return values does not mean a fatal error, but rather a warning.
+ // 0 - success
+ // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+ // < 0 - error
+ LLAMA_API int llama_decode(
+ struct llama_context * ctx,
+ struct llama_batch batch);
+
+ // Set the number of threads used for decoding
+ // n_threads is the number of threads used for generation (single token)
+ // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+ LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
- // Can be mutated in order to change the probabilities of the next token
- // Rows: n_tokens
+ // Logits for which llama_batch.logits[i] == 0 are undefined
+ // Rows: n_tokens provided with llama_batch
  // Cols: n_vocab
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
+ // Logits for the ith token. Equivalent to:
+ // llama_get_logits(ctx) + i*n_vocab
+ LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
  // Get the embeddings for the input
  // shape: [n_embd] (1-dimensional)
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
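Putting the decoding pieces together: a greedy generation loop built on llama_batch, llama_decode and llama_get_logits_ith might look like the following sketch. The argmax loop stands in for the sampling functions declared later in this header, and an explicit logits flag is used so the rows read back are well defined.

    #include "llama.h"

    // Sketch, not from the diff: feed the prompt as one batch (logits only on
    // the last token), then generate greedily one token at a time.
    static int generate(struct llama_context * ctx,
                        const llama_token * prompt, int32_t n_prompt, int n_predict) {
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));

        struct llama_batch batch = llama_batch_init(n_prompt, /*embd=*/0);

        // prompt: sequence 0, positions 0..n_prompt-1
        batch.n_tokens = n_prompt;
        for (int32_t i = 0; i < n_prompt; ++i) {
            batch.token [i] = prompt[i];
            batch.pos   [i] = i;
            batch.seq_id[i] = 0;
            batch.logits[i] = (i == n_prompt - 1);   // logits only for the last token
        }

        if (llama_decode(ctx, batch) != 0) { llama_batch_free(batch); return -1; }

        llama_pos n_past = n_prompt;
        int32_t   i_last = n_prompt - 1;             // row holding the defined logits

        for (int n = 0; n < n_predict; ++n) {
            const float * logits = llama_get_logits_ith(ctx, i_last);

            // plain argmax stands in for the sampling API shown below
            llama_token best = 0;
            for (llama_token t = 1; t < n_vocab; ++t) {
                if (logits[t] > logits[best]) best = t;
            }

            // feed the chosen token back as a single-token batch
            batch.n_tokens  = 1;
            batch.token [0] = best;
            batch.pos   [0] = n_past++;
            batch.seq_id[0] = 0;
            batch.logits[0] = 1;
            i_last = 0;

            if (llama_decode(ctx, batch) != 0) { llama_batch_free(batch); return -1; }
        }

        llama_batch_free(batch);
        return 0;
    }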
@@ -372,15 +500,9 @@ extern "C" {
  // Returns the number of tokens on success, no more than n_max_tokens
  // Returns a negative number on failure - the number of tokens that would have been returned
  LLAMA_API int llama_tokenize(
- struct llama_context * ctx,
- const char * text,
- llama_token * tokens,
- int n_max_tokens,
- bool add_bos);
-
- LLAMA_API int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
+ int text_len,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos);
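llama_tokenize now takes the model plus an explicit text_len, and a negative return value reports how many tokens the buffer would need. A sketch of the usual two-pass call:

    #include <stdbool.h>
    #include <stdlib.h>
    #include <string.h>
    #include "llama.h"

    // Sketch, not from the diff: tokenize with the new model-based signature,
    // growing the buffer when the first call reports (as a negative value)
    // the required token count.
    static llama_token * tokenize(const struct llama_model * model,
                                  const char * text, bool add_bos, int * n_out) {
        int cap = 64;
        llama_token * tokens = (llama_token *) malloc(cap * sizeof(llama_token));

        int n = llama_tokenize(model, text, (int) strlen(text), tokens, cap, add_bos);
        if (n < 0) {
            cap    = -n;    // number of tokens that would have been returned
            tokens = (llama_token *) realloc(tokens, cap * sizeof(llama_token));
            n      = llama_tokenize(model, text, (int) strlen(text), tokens, cap, add_bos);
        }

        *n_out = n;
        return tokens;      // caller frees
    }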
@@ -390,12 +512,6 @@ extern "C" {
  // Does not write null terminator to the buffer.
  // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
  LLAMA_API int llama_token_to_piece(
- const struct llama_context * ctx,
- llama_token token,
- char * buf,
- int length);
-
- LLAMA_API int llama_token_to_piece_with_model(
  const struct llama_model * model,
  llama_token token,
  char * buf,
@@ -418,11 +534,25 @@ extern "C" {
  // Sampling functions
  //
 
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+ LLAMA_API void llama_sample_repetition_penalty(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const llama_token * last_tokens,
+ size_t last_tokens_size,
+ float penalty);
 
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const llama_token * last_tokens,
+ size_t last_tokens_size,
+ float alpha_frequency,
+ float alpha_presence);
 
  /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -435,23 +565,54 @@ extern "C" {
  float scale);
 
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
- LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API void llama_sample_softmax(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);
 
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+ LLAMA_API void llama_sample_top_k(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ int k,
+ size_t min_keep);
 
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+ LLAMA_API void llama_sample_top_p(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float p,
+ size_t min_keep);
 
  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+ LLAMA_API void llama_sample_tail_free(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float z,
+ size_t min_keep);
 
  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
- LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
- LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+ LLAMA_API void llama_sample_typical(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float p,
+ size_t min_keep);
+
+ LLAMA_API void llama_sample_temp(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float temp);
+
+ LLAMA_API DEPRECATED(void llama_sample_temperature(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float temp),
+ "use llama_sample_temp instead");
 
  /// @details Apply constraints from grammar
- LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+ LLAMA_API void llama_sample_grammar(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const struct llama_grammar * grammar);
 
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
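A typical sampling chain over these functions, sketched below; the llama_token_data and llama_token_data_array types are assumed from the unchanged part of llama.h, and the cutoff values are arbitrary examples.

    #include <stdlib.h>
    #include "llama.h"

    // Sketch, not from the diff: build a candidate array from one row of
    // logits, apply a few of the samplers above, and pick a token.
    static llama_token sample_next(struct llama_context * ctx, const float * logits,
                                   const llama_token * last_tokens, size_t n_last) {
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));

        llama_token_data * cand = (llama_token_data *) malloc(n_vocab * sizeof(llama_token_data));
        for (llama_token t = 0; t < n_vocab; ++t) {
            cand[t].id    = t;
            cand[t].logit = logits[t];
            cand[t].p     = 0.0f;
        }
        llama_token_data_array cur = { cand, (size_t) n_vocab, false };

        llama_sample_repetition_penalty(ctx, &cur, last_tokens, n_last, 1.1f);
        llama_sample_top_k (ctx, &cur, 40,    1);
        llama_sample_top_p (ctx, &cur, 0.95f, 1);
        llama_sample_temp  (ctx, &cur, 0.8f);      // replaces llama_sample_temperature

        const llama_token id = llama_sample_token(ctx, &cur);

        free(cand);
        return id;
    }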
@@ -459,23 +620,41 @@ extern "C" {
  /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
  /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
  /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
- LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+ LLAMA_API llama_token llama_sample_token_mirostat(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float tau,
+ float eta,
+ int m,
+ float * mu);
 
  /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
  /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
  /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
- LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ float tau,
+ float eta,
+ float * mu);
 
  /// @details Selects the token with the highest probability.
- LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API llama_token llama_sample_token_greedy(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);
 
  /// @details Randomly selects a token from the candidates based on their probabilities.
- LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+ LLAMA_API llama_token llama_sample_token(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates);
 
  /// @details Accepts the sampled token into the grammar
- LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+ LLAMA_API void llama_grammar_accept_token(
+ struct llama_context * ctx,
+ struct llama_grammar * grammar,
+ llama_token token);
 
  //
  // Beam search
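For the mirostat samplers, `mu` persists across sampling steps and is conventionally initialized to `2 * tau`, matching the @param mu note above. A brief sketch for mirostat 2.0; the llama_token_data_array type is assumed from the unchanged part of llama.h, and tau/eta values are illustrative.

    // Sketch, not from the diff: one mirostat 2.0 sampling step. `mu` lives in
    // the caller and is initialized once per generation as `mu = 2.0f * tau`.
    static llama_token sample_mirostat_step(struct llama_context * ctx,
                                            llama_token_data_array * candidates,
                                            float tau, float eta, float * mu) {
        return llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu);
    }

    // per generation:
    //     float mu = 2.0f * 5.0f;   // 2 * tau, with tau = 5.0f
    //     ... each step: tok = sample_mirostat_step(ctx, &cur, 5.0f, 0.1f, &mu);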
@@ -483,9 +662,10 @@ extern "C" {
 
  struct llama_beam_view {
  const llama_token * tokens;
+
  size_t n_tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Callback should set this to true when a beam is at end-of-beam.
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Callback should set this to true when a beam is at end-of-beam.
  };
 
  // Passed to beam_search_callback function.
@@ -494,9 +674,10 @@ extern "C" {
  // These pointers are valid only during the synchronous callback, so should not be saved.
  struct llama_beams_state {
  struct llama_beam_view * beam_views;
+
  size_t n_beams; // Number of elements in beam_views[].
  size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
- bool last_call; // True iff this is the last callback invocation.
+ bool last_call; // True iff this is the last callback invocation.
  };
 
  // Type of pointer to the beam_search_callback function.
@@ -511,11 +692,17 @@ extern "C" {
  /// @param n_beams Number of beams to use.
  /// @param n_past Number of tokens already evaluated.
  /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
- /// @param n_threads Number of threads as passed to llama_eval().
- LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+ LLAMA_API void llama_beam_search(
+ struct llama_context * ctx,
+ llama_beam_search_callback_fn_t callback,
+ void * callback_data,
+ size_t n_beams,
+ int n_past,
+ int n_predict);
 
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
@@ -524,7 +711,7 @@ extern "C" {
 
  // Set callback for all future logging events.
  // If this is not called, or NULL is supplied, everything is output on stderr.
- LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+ LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
  LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 
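llama_log_set now takes a ggml_log_callback from ggml.h instead of the removed llama_log_callback. A sketch of a custom callback; the (level, text, user_data) parameter list is assumed to mirror the removed typedef, so treat it as illustrative rather than authoritative.

    #include <stdio.h>
    #include "llama.h"   // pulls in ggml.h, which declares ggml_log_callback

    // Sketch, not from the diff: forward llama.cpp log output to stderr.
    // The (level, text, user_data) parameter list is assumed; `text` usually
    // already ends with a newline, as the removed comment block noted.
    static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        fprintf(stderr, "[llama %d] %s", (int) level, text);
    }

    // during initialization:
    //     llama_log_set(my_log_callback, NULL);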
@@ -540,7 +727,9 @@ extern "C" {
 
  struct ggml_tensor;
 
- const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+ struct llama_context * ctx
+ );
 
  #endif // LLAMA_API_INTERNAL
 
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.5.2'
+ VERSION = '0.6.0'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1'
+ LLAMA_CPP_VERSION = 'b1292'
  end