llama_cpp 0.13.0 → 0.14.1

@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
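
The vocab-type constants are renumbered here (0 now means LLAMA_VOCAB_TYPE_NONE), so callers should compare against the enum names rather than hard-coded integers. A minimal sketch, assuming `model` is an already loaded `llama_model *`:

    if (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM) {
        // SentencePiece-specific handling; SPM is now 1, not 0
    }
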
@@ -129,6 +130,7 @@ extern "C" {
     };
 
     enum llama_pooling_type {
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
@@ -162,7 +164,7 @@ extern "C" {
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
     // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits for the respective token will not be output
+    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -172,7 +174,7 @@ extern "C" {
         llama_pos * pos;
         int32_t * n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t * logits;
+        int8_t * logits; // TODO: rename this to "output"
 
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
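
Since the `logits` flag now also gates embedding output, a common pattern is to request output only for the last prompt token. A minimal sketch, assuming `prompt_tokens` and `n_prompt` are supplied by the caller:

    llama_batch batch = llama_batch_init(n_prompt, 0, 1);
    for (int i = 0; i < n_prompt; i++) {
        batch.token[i]     = prompt_tokens[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = (i == n_prompt - 1); // output only for the last token
    }
    batch.n_tokens = n_prompt;
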
@@ -233,10 +235,15 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed;  // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
-        uint32_t n_batch; // prompt processing maximum batch size
+        uint32_t n_batch;   // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;  // physical maximum batch size
+        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-        int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type pooling_type;           // whether to pool (sum) embedding results by sequence id
+                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
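
A sketch of filling the new fields, starting from `llama_context_default_params()`; the specific values are illustrative only:

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx        = 4096;
    cparams.n_batch      = 2048;                    // logical batch passed to llama_decode
    cparams.n_ubatch     = 512;                     // physical batch processed at once
    cparams.n_seq_max    = 4;                       // parallel sequences / recurrent states
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // pooled sequence embeddings
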
@@ -255,10 +262,15 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embedding;  // embedding mode only
+        bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings; // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void * abort_callback_data;
     };
 
     // model quantization parameters
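
The abort callback follows the `ggml_abort_callback` convention: a function taking a `void *` user-data pointer and returning true to abort. A sketch of a cooperative-cancellation hook; `cancel_requested` and `cparams` are hypothetical names owned by the caller:

    static bool my_should_abort(void * data) {
        const volatile bool * cancel = (const volatile bool *) data;
        return *cancel; // true aborts the current llama_decode() call
    }

    // when filling llama_context_params:
    cparams.abort_callback      = my_should_abort;
    cparams.abort_callback_data = (void *) &cancel_requested;
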
@@ -268,7 +280,7 @@ extern "C" {
         bool allow_requantize; // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure; // disable k-quant mixtures and quantize all tensors to the same type
+        bool pure; // quantize all tensors to the default type
         void * imatrix; // pointer to importance matrix data
     } llama_model_quantize_params;
 
@@ -367,6 +379,8 @@ extern "C" {
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
@@ -445,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
+        int32_t n_seq_max;
 
         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -465,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;
 
-        // The sequences for each cell. There will be n_max_seq items per cell.
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };
 
     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 
     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
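
Usage is unchanged apart from the renamed parameter; a debugging-only sketch, assuming `ctx` is a live context:

    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max =*/ 4);
    llama_kv_cache_view_update(ctx, &view); // refresh the snapshot
    // inspect view.cells and view.cells_sequences (n_seq_max entries per cell)
    llama_kv_cache_view_free(&view);
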
@@ -493,7 +507,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
             llama_pos p0,
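
Because removal can now report failure, callers should check the return value instead of assuming success. A sketch, where `n_keep` is a hypothetical number of leading tokens to preserve in sequence 0:

    if (!llama_kv_cache_seq_rm(ctx, 0, n_keep, -1)) {
        // removal failed; fall back to clearing the whole cache
        llama_kv_cache_clear(ctx);
    }
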
@@ -632,7 +646,19 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
-    // Token logits obtained from the last call to llama_eval()
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
+    // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Rows: n_tokens provided with llama_batch
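
A sketch combining the new calls for an embedding-style pass over a prepared `batch`; disabling causal attention and the explicit synchronization are both optional depending on model and backend:

    llama_set_causal_attn(ctx, false); // bidirectional attention, e.g. for encoder-style embeddings
    if (llama_decode(ctx, batch) != 0) {
        // non-zero return: decoding did not complete normally
    }
    llama_synchronize(ctx);            // optional: the llama_get_* accessors synchronize implicitly
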
@@ -643,14 +669,20 @@ extern "C" {
     //   llama_get_logits(ctx) + i*n_vocab
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
+    // Get all output token embeddings
+    // shape: [n_tokens*n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith sequence
+    // Get the embeddings for the ith token
     //   llama_get_embeddings(ctx) + i*n_embd
+    // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
     //
     // Vocab
     //
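
A sketch of reading a pooled sequence embedding after a decode with `embeddings` enabled and a pooling type other than NONE:

    const int32_t n_embd = llama_n_embd(model);
    const float * emb    = llama_get_embeddings_seq(ctx, 0); // sequence id 0
    if (emb != NULL) {
        // emb points at n_embd floats holding the pooled embedding for the sequence
    }
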
@@ -684,7 +716,7 @@ extern "C" {
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
@@ -693,7 +725,7 @@ extern "C" {
             const char * text,
             int32_t text_len,
             llama_token * tokens,
-            int32_t n_max_tokens,
+            int32_t n_tokens_max,
             bool add_bos,
             bool special);
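
The renamed parameter keeps the existing contract: a negative return value is the negated number of tokens that would have been written. A sketch of the usual two-pass pattern, with `model`, `text`, `text_len`, `tokens`, and `n_tokens_max` supplied by the caller:

    int32_t n = llama_tokenize(model, text, text_len, tokens, n_tokens_max,
                               /*add_bos =*/ true, /*special =*/ false);
    if (n < 0) {
        // buffer too small: grow `tokens` to hold -n entries and tokenize again
        n_tokens_max = -n;
    }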