llama_cpp 0.13.0 → 0.14.1

@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
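Every existing enumerator shifts by one here, so this is a breaking change for any caller that stored or compared the raw integer values; code that uses the symbolic names only needs a recompile. A minimal sketch of name-based dispatch (the model-loading boilerplate is assumed):

    #include "llama.h"

    // Map the tokenizer family to a printable name using the enumerators
    // above; raw integer comparisons such as `type == 0` silently change
    // meaning across this version bump.
    static const char * vocab_name(const struct llama_model * model) {
        switch (llama_vocab_type(model)) {
            case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
            case LLAMA_VOCAB_TYPE_SPM:  return "SentencePiece";
            case LLAMA_VOCAB_TYPE_BPE:  return "Byte Pair Encoding";
            case LLAMA_VOCAB_TYPE_WPM:  return "WordPiece";
        }
        return "unknown";
    }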
@@ -129,6 +130,7 @@ extern "C" {
     };
 
     enum llama_pooling_type {
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
@@ -162,7 +164,7 @@ extern "C" {
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
     // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits for the respective token will not be output
+    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -172,7 +174,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits;
+        int8_t       *  logits; // TODO: rename this to "output"
 
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
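The `logits` flags now gate embedding extraction as well, so for a plain prompt evaluation it is enough to request output for the final position only. A minimal sketch (the batch is assumed to have been filled already, e.g. via the existing `llama_batch_init` helper):

    #include "llama.h"

    // Request output (logits and/or embeddings) only for the last token of
    // the batch; all other positions are skipped by the backend.
    static void mark_last_token_for_output(struct llama_batch * batch) {
        for (int32_t i = 0; i < batch->n_tokens; ++i) {
            batch->logits[i] = 0;
        }
        batch->logits[batch->n_tokens - 1] = 1;
    }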
@@ -233,10 +235,15 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed;      // RNG seed, -1 for random
         uint32_t n_ctx;     // text context, 0 = from model
-        uint32_t n_batch;   // prompt processing maximum batch size
+        uint32_t n_batch;   // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;  // physical maximum batch size
+        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-        int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
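With this release `n_batch` becomes the logical per-call limit for llama_decode, the new `n_ubatch` is the physical micro-batch the backend actually executes, and `n_seq_max` bounds how many sequences the context tracks. A sketch of filling the new fields (the numeric values are illustrative, not recommendations):

    #include "llama.h"

    static struct llama_context_params make_ctx_params(void) {
        struct llama_context_params p = llama_context_default_params();
        p.n_ctx        = 4096; // text context
        p.n_batch      = 2048; // logical max tokens per llama_decode() call
        p.n_ubatch     = 512;  // physical micro-batch executed by the backend
        p.n_seq_max    = 4;    // distinct sequences kept in the KV cache
        p.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // defer to the model
        return p;
    }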
@@ -255,10 +262,15 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embedding;   // embedding mode only
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     // model quantization parameters
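Besides the `embedding` to `embeddings` rename (the old `do_pooling` flag moves into `pooling_type` above), the context gains an abort hook. A sketch of a deadline-based callback; the timing scheme is an assumption, only the `abort_callback` fields come from the header:

    #include <stdbool.h>
    #include <time.h>
    #include "llama.h"

    // Returning true makes llama_decode() abort (CPU backend only, per the
    // comment in the header).
    static bool deadline_reached(void * data) {
        const time_t * deadline = (const time_t *) data;
        return time(NULL) >= *deadline;
    }

    // usage sketch:
    //     time_t deadline = time(NULL) + 30;
    //     params.abort_callback      = deadline_reached;
    //     params.abort_callback_data = &deadline;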
@@ -268,7 +280,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        bool pure;                   // quantize all tensors to the default type
         void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;
 
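Only the comment on `pure` changes, but it clarifies the semantics: every tensor gets the default type of the chosen ftype instead of a k-quant mixture. A minimal sketch (the file names are placeholders):

    #include "llama.h"

    // Requantize a GGUF file with `pure` set; returns 0 on success.
    static uint32_t quantize_pure(const char * src, const char * dst) {
        llama_model_quantize_params p = llama_model_quantize_default_params();
        p.ftype = LLAMA_FTYPE_MOSTLY_Q4_0;
        p.pure  = true; // all tensors get the default type for this ftype
        return llama_model_quantize(src, dst, &p);
    }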
@@ -367,6 +379,8 @@ extern "C" {
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
@@ -445,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
+        int32_t n_seq_max;
 
         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -465,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;
 
-        // The sequences for each cell. There will be n_max_seq items per cell.
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };
 
     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 
     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
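The `n_max_seq` to `n_seq_max` rename is mechanical but source-breaking for code that names the field or parameter. A debugging sketch around the renamed parameter (`llama_kv_cache_view_update` is the existing companion call; the field reads assume the struct above):

    #include <stdio.h>
    #include "llama.h"

    // Print coarse KV-cache occupancy (debugging only, per the header).
    static void dump_kv_usage(struct llama_context * ctx) {
        struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
        llama_kv_cache_view_update(ctx, &view);
        printf("cells: %d, tokens in cache: %d\n", view.n_cells, view.token_count);
        llama_kv_cache_view_free(&view);
    }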
@@ -493,7 +507,7 @@ extern "C" {
     //   seq_id < 0 : match any sequence
     //   p0 < 0     : [0,  p1]
     //   p1 < 0     : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
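`llama_kv_cache_seq_rm` now reports whether the removal happened, so callers should check the result instead of assuming success and treat false as "the cache was left unchanged". A minimal sketch using the range conventions documented above:

    #include <stdbool.h>
    #include "llama.h"

    // Drop everything from position p0 onwards in one sequence.
    static bool truncate_sequence(struct llama_context * ctx, llama_seq_id seq, llama_pos p0) {
        return llama_kv_cache_seq_rm(ctx, seq, p0, -1); // p1 < 0 => [p0, inf)
    }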
@@ -632,7 +646,19 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
-    // Token logits obtained from the last call to llama_eval()
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
+    // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Rows: n_tokens provided with llama_batch
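The header notes that the result getters synchronize implicitly; an explicit `llama_synchronize` matters when you measure or order work yourself. A timing sketch under that assumption (batch construction elided):

    #include <time.h>
    #include "llama.h"

    // Time a decode call; without llama_synchronize() the clock could stop
    // before the backend has actually finished the computation.
    static double timed_decode(struct llama_context * ctx, struct llama_batch batch) {
        const clock_t t0 = clock();
        const int32_t rc = llama_decode(ctx, batch);
        llama_synchronize(ctx);
        return rc == 0 ? (double)(clock() - t0) / CLOCKS_PER_SEC : -1.0;
    }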
@@ -643,14 +669,20 @@ extern "C" {
     // llama_get_logits(ctx) + i*n_vocab
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
+    // Get all output token embeddings
+    // shape: [n_tokens*n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith sequence
+    // Get the embeddings for the ith token
     // llama_get_embeddings(ctx) + i*n_embd
+    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
     //
     // Vocab
     //
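The new `llama_get_embeddings_seq` returns the pooled embedding for one sequence; per the comments above it requires a pooling type other than NONE (and `embeddings = true` in the context params), hence the NULL check. A minimal sketch:

    #include <stdio.h>
    #include "llama.h"

    // Print the dimensionality and first component of a pooled sequence
    // embedding after a successful llama_decode().
    static void print_seq_embedding(struct llama_context * ctx,
                                    const struct llama_model * model,
                                    llama_seq_id seq) {
        const float * emb = llama_get_embeddings_seq(ctx, seq);
        if (emb == NULL) {
            fprintf(stderr, "no pooled embedding (pooling_type is NONE?)\n");
            return;
        }
        printf("n_embd = %d, emb[0] = %f\n", llama_n_embd(model), emb[0]);
    }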
@@ -684,7 +716,7 @@ extern "C" {
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
@@ -693,7 +725,7 @@ extern "C" {
             const char * text,
                 int32_t   text_len,
             llama_token * tokens,
-                int32_t   n_max_tokens,
+                int32_t   n_tokens_max,
                    bool   add_bos,
                    bool   special);
 
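Only the parameter is renamed; the behavior is unchanged. Together with the negative-return convention documented above, it supports the usual size-then-fill pattern, sketched here (empty-input and allocation-failure handling is simplified):

    #include <stdlib.h>
    #include <string.h>
    #include "llama.h"

    // Tokenize into a freshly allocated buffer: a first pass with
    // n_tokens_max = 0 returns the negated required count.
    static llama_token * tokenize_alloc(const struct llama_model * model,
                                        const char * text, int32_t * n_out) {
        const int32_t text_len = (int32_t) strlen(text);
        int32_t n = llama_tokenize(model, text, text_len, NULL, 0,
                                   /*add_bos=*/true, /*special=*/false);
        if (n >= 0) { *n_out = n; return NULL; } // nothing to tokenize
        n = -n;
        llama_token * tokens = malloc((size_t) n * sizeof(llama_token));
        *n_out = llama_tokenize(model, text, text_len, tokens, n,
                                /*add_bos=*/true, /*special=*/false);
        return tokens;
    }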