llama_cpp 0.14.2 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,7 +39,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 5
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +60,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
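The enum values are unchanged; only the comments are rewritten to name the tokenizer families. A small sketch (assuming the `llama_vocab_type(model)` accessor declared elsewhere in this header) of reporting the family of a loaded model:

```cpp
#include "llama.h"
#include <cstdio>

// Print the tokenizer family of a loaded model, following the updated comments.
static void print_vocab_type(const struct llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_SPM: printf("SPM: LLaMA-style byte-level BPE with byte fallback\n"); break;
        case LLAMA_VOCAB_TYPE_BPE: printf("BPE: GPT-2-style byte-level BPE\n");                    break;
        case LLAMA_VOCAB_TYPE_WPM: printf("WPM: BERT-style WordPiece\n");                          break;
        default:                   printf("no vocab\n");                                           break;
    }
}
```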
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S  = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
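Together with the new LLAMA_FTYPE_MOSTLY_IQ1_M entry above, the two new tensor-type fields let callers pin specific tensors to a chosen type, and kv_overrides carries metadata overrides. A hedged sketch (file names are placeholders; IQ1_M in practice also wants an importance matrix supplied via imatrix):

```cpp
#include "llama.h"

// Sketch: quantize a model to IQ1_M while keeping the output and token-embedding
// tensors at Q8_0. Error handling is minimal.
int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                = LLAMA_FTYPE_MOSTLY_IQ1_M;
    params.output_tensor_type   = GGML_TYPE_Q8_0; // override the type used for output.weight
    params.token_embedding_type = GGML_TYPE_Q8_0; // override the type used for token embeddings
    params.nthread              = 8;
    params.kv_overrides         = NULL;           // no metadata overrides in this sketch
    // params.imatrix would normally point at importance-matrix data for IQ1_M

    // returns 0 on success
    return (int) llama_model_quantize("ggml-model-f16.gguf", "ggml-model-iq1_m.gguf", &params);
}
```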
@@ -388,6 +392,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -435,10 +440,24 @@ extern "C" {
     // Returns 0 on success
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                         const char * path_lora,
-                                float scale,
-                         const char * path_base_model,
-                              int32_t n_threads);
+                      const char * path_lora,
+                             float scale,
+                      const char * path_base_model,
+                           int32_t n_threads);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                            size_t len,
+                           int32_t n_embd,
+                           int32_t il_start,
+                           int32_t il_end);
 
     //
     // KV cache
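A short sketch of the new call, sizing the buffer from llama_n_embd and the llama_n_layer accessor added above; producing meaningful control directions is out of scope here (see llama_control_vector_load in common):

```cpp
#include "llama.h"
#include <vector>

// Apply a control vector across all layers, then clear it again.
// Layout per the comment above: n_embd floats per layer, starting from layer 1.
static void apply_and_clear(llama_context * ctx, const std::vector<float> & data) {
    const llama_model * model = llama_get_model(ctx);
    const int32_t n_embd  = llama_n_embd (model);
    const int32_t n_layer = llama_n_layer(model);

    // expects data.size() == (size_t) n_embd * n_layer
    llama_control_vector_apply(ctx, data.data(), data.size(), n_embd, 1, n_layer);

    // ... decode with the control vector active ...

    // passing NULL clears the currently loaded vector
    llama_control_vector_apply(ctx, NULL, 0, n_embd, 1, n_layer);
}
```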
@@ -659,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for the last token are stored in the last row
-    // Logits for which llama_batch.logits[i] == 0 are undefined
-    // Rows: n_tokens provided with llama_batch
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    // shape: [n_tokens*n_embd] (1-dimensional)
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
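This changes what llama_get_logits() and llama_get_embeddings() return: one row per requested output rather than one per batch token, with llama_get_logits_ith() translating batch positions through output_ids. A sketch of the common pattern of requesting logits only for the final prompt token:

```cpp
#include "llama.h"
#include <vector>

// Decode a prompt, requesting logits only for its last token, and read them back.
// Positions that did not request logits now return NULL from llama_get_logits_ith().
static std::vector<float> eval_prompt(llama_context * ctx, const std::vector<llama_token> & prompt) {
    llama_batch batch = llama_batch_init((int32_t) prompt.size(), 0, 1);
    for (size_t i = 0; i < prompt.size(); ++i) {
        batch.token   [i]    = prompt[i];
        batch.pos     [i]    = (llama_pos) i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = (i == prompt.size() - 1); // only the last token produces an output row
    }
    batch.n_tokens = (int32_t) prompt.size();

    std::vector<float> logits;
    if (llama_decode(ctx, batch) == 0) {
        const float * row = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        if (row) {
            logits.assign(row, row + llama_n_vocab(llama_get_model(ctx)));
        }
    }
    llama_batch_free(batch);
    return logits;
}
```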
@@ -945,6 +970,16 @@ extern "C" {
                        int32_t   n_past,
                        int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
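A sketch mirroring the examples in the new doxygen comments, for models split into multiple GGUF chunks (the model path is a placeholder):

```cpp
#include "llama.h"
#include <cstdio>

// Build the path of one split chunk, then recover the prefix from it.
int main() {
    char split_path[512];
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    printf("%s\n", split_path); // per the header example: /models/ggml-model-q4_0-00002-of-00004.gguf

    char split_prefix[512];
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        printf("%s\n", split_prefix); // /models/ggml-model-q4_0
    }
    return 0;
}
```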
@@ -972,10 +1007,38 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t             index;
+    const uint32_t   * code_points;
+    llama_partial_utf8 partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                   chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8   partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
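These declarations are only visible when LLAMA_API_INTERNAL is defined before including llama.h (they are exposed mainly for the grammar tests). A loosely hedged sketch of how decode_utf8's partial state might be threaded across token pieces that split a multi-byte character:

```cpp
// Internal API: requires LLAMA_API_INTERNAL before including the header.
#define LLAMA_API_INTERNAL
#include "llama.h"

#include <string>

int main() {
    llama_partial_utf8 state = { 0, 0 };

    // Made-up token pieces that split U+2714 across two strings.
    const std::string pieces[2] = { std::string("\xe2\x9c", 2), std::string("\x94", 1) };

    for (const std::string & piece : pieces) {
        const auto decoded = decode_utf8(piece, state);
        // decoded.first holds the code points completed so far;
        // decoded.second.n_remain > 0 means the sequence continues in the next piece.
        state = decoded.second;
    }
    return state.n_remain == 0 ? 0 : 1;
}
```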