llama_cpp 0.14.0 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;

     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };

     // note: these values should be synchronized with ggml_rope
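
Note that the new NONE entry renumbers every pre-existing enumerator, so code that compared vocab types against raw integers (0/1/2) will silently misbehave after upgrading. A minimal sketch of the safe pattern, using the llama_vocab_type() accessor declared further down in this header and assuming a loaded llama_model * model:

    // dispatch on the named constants, never on hard-coded integers
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_NONE: /* model ships no vocab         */ break;
        case LLAMA_VOCAB_TYPE_SPM:  /* SentencePiece tokenizer      */ break;
        case LLAMA_VOCAB_TYPE_BPE:  /* Byte Pair Encoding tokenizer */ break;
        case LLAMA_VOCAB_TYPE_WPM:  /* WordPiece tokenizer          */ break;
    }
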
@@ -234,7 +235,9 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed;    // RNG seed, -1 for random
         uint32_t n_ctx;   // text context, 0 = from model
-        uint32_t n_batch; // prompt processing maximum batch size
+        uint32_t n_batch;   // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;  // physical maximum batch size
+        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing

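The single n_batch field is now split: n_batch is the logical limit enforced by llama_decode, n_ubatch is the physical limit per compute graph, and n_seq_max caps how many sequence ids the context can track. A sketch of configuring them, assuming a model already loaded as llama_model * model:

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 4096; // text context
    cparams.n_batch   = 512;  // logical cap: tokens per llama_decode call
    cparams.n_ubatch  = 256;  // physical cap: tokens per compute graph
    cparams.n_seq_max = 4;    // distinct sequences the KV cache may hold
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
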
@@ -277,7 +280,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        bool pure;                   // quantize all tensors to the default type
         void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;

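Only the comment changes here; pure still means no per-tensor type mixing. A hedged sketch of a pure quantization run (the file names are placeholders):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_0; // the "default type" for this run
    qparams.pure  = true;                    // every tensor becomes Q4_0
    llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", &qparams);
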
@@ -376,6 +379,8 @@ extern "C" {

     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);

     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
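
The two new getters mirror the new context parameters, so runtime code can size its buffers without carrying the creation params around. For example, assuming <stdio.h>:

    printf("n_batch=%u n_ubatch=%u n_seq_max=%u\n",
           llama_n_batch(ctx), llama_n_ubatch(ctx), llama_n_seq_max(ctx));
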
@@ -454,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
+        int32_t n_seq_max;

         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -474,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;

-        // The sequences for each cell. There will be n_max_seq items per cell.
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };

     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);

     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
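
Together with the struct field above, this is a source-breaking rename (n_max_seq → n_seq_max) with no behavioral change. A debugging sketch, assuming llama_kv_cache_view_update() from the same header and a live context:

    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
    llama_kv_cache_view_update(ctx, &view);  // snapshot the current cache state
    for (int32_t i = 0; i < view.n_cells; i++) {
        // n_seq_max sequence ids per cell, flattened row-major
        const llama_seq_id * seqs = view.cells_sequences + (size_t) i * view.n_seq_max;
        (void) seqs; // inspect seqs[0 .. n_seq_max) for cell i here
    }
    llama_kv_cache_view_free(&view);
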
@@ -502,7 +507,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
             llama_pos p0,
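
llama_kv_cache_seq_rm can now report failure (e.g. when the cache cannot drop an arbitrary token range; the exact conditions are backend-dependent), so the result is worth checking. A sketch; the fallback strategy is an assumption, not mandated by the header:

    // try to drop tokens [10, inf) from sequence 0
    if (!llama_kv_cache_seq_rm(ctx, 0, 10, -1)) {
        llama_kv_cache_seq_rm(ctx, 0, -1, -1); // fall back: clear the whole sequence
    }
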
@@ -641,9 +646,18 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
     // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
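
A sketch combining the two additions: non-causal attention suits embedding-style passes where every token may attend to every other, and llama_synchronize() blocks until the backend finishes, e.g. before reading timings. The batch variable is assumed to be a prepared llama_batch:

    llama_set_causal_attn(ctx, false);   // bidirectional attention for this pass
    if (llama_decode(ctx, batch) != 0) { /* handle decode failure */ }
    llama_synchronize(ctx);              // normally implicit in llama_get_logits()
    llama_set_causal_attn(ctx, true);    // restore the default for generation
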
@@ -702,7 +716,7 @@ extern "C" {

     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
@@ -711,7 +725,7 @@ extern "C" {
             const char * text,
             int32_t text_len,
             llama_token * tokens,
-            int32_t n_max_tokens,
+            int32_t n_tokens_max,
             bool add_bos,
             bool special);
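
The rename (n_max_tokens → n_tokens_max) is cosmetic; the contract is unchanged. A sketch of the grow-and-retry pattern the negative return value enables, assuming <stdlib.h>, <string.h>, and a loaded model:

    int32_t cap = 64;
    llama_token * tokens = malloc(cap * sizeof *tokens);
    int32_t n_tok = llama_tokenize(model, text, (int32_t) strlen(text),
                                   tokens, cap, /*add_bos=*/true, /*special=*/false);
    if (n_tok < 0) {                     // buffer too small: -n_tok tokens needed
        cap    = -n_tok;
        tokens = realloc(tokens, cap * sizeof *tokens);
        n_tok  = llama_tokenize(model, text, (int32_t) strlen(text),
                                tokens, cap, true, false);
    }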