llama_cpp 0.14.3 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,7 +39,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 5
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +60,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S  = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;          // quantize to this llama_ftype
-        bool allow_requantize;           // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor;     // quantize output.weight
-        bool only_copy;                  // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                       // quantize all tensors to the default type
-        void * imatrix;                  // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
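
The quantization parameters gain two per-tensor type overrides and a kv_overrides pointer, and the ftype enum gains LLAMA_FTYPE_MOSTLY_IQ1_M. A minimal sketch of how the extended struct might be filled and passed to llama_model_quantize(); the file names and the GGML_TYPE_Q8_0 choices are placeholders, not recommendations from this package:

```cpp
#include "llama.h"

// Sketch: quantize a model using the fields added in this release.
static int quantize_sketch(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype                = LLAMA_FTYPE_MOSTLY_IQ1_M; // new ftype (31) in this release
    params.output_tensor_type   = GGML_TYPE_Q8_0;           // placeholder: keep output.weight at higher precision
    params.token_embedding_type = GGML_TYPE_Q8_0;           // placeholder: same for the token embeddings
    params.nthread              = 8;

    // returns 0 on success
    return (int) llama_model_quantize("model-f16.gguf", "model-iq1_m.gguf", &params);
}
```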
@@ -674,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for the last token are stored in the last row
-    // Logits for which llama_batch.logits[i] == 0 are undefined
-    // Rows: n_tokens provided with llama_batch
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    // shape: [n_tokens*n_embd] (1-dimensional)
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
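
With this change, only tokens flagged in llama_batch.logits produce output rows, and the ith accessors map the batch index through output_ids, returning NULL otherwise. A minimal sketch of the intended usage, assuming a llama_context and a tokenized prompt prepared elsewhere:

```cpp
#include "llama.h"

// Sketch: request logits only for the last prompt token and read them back.
static void decode_and_read_logits(struct llama_context * ctx,
                                   const llama_token * tokens, int32_t n_tokens) {
    struct llama_batch batch = llama_batch_init(n_tokens, 0, 1);

    for (int32_t i = 0; i < n_tokens; i++) {
        batch.token[i]     = tokens[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = (i == n_tokens - 1); // only the last token produces an output row
    }
    batch.n_tokens = n_tokens;

    if (llama_decode(ctx, batch) == 0) {
        // only rows with batch.logits[i] != 0 are stored; the ith accessor maps the
        // batch index to the right row and returns NULL if no output was requested
        const float * logits = llama_get_logits_ith(ctx, n_tokens - 1);
        (void) logits;
    }

    llama_batch_free(batch);
}
```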
@@ -960,6 +970,16 @@ extern "C" {
             int32_t   n_past,
             int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
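
A short sketch of how the two new split-GGUF helpers compose, reusing the path from the doc comments above; the buffer sizes are arbitrary:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: build a split path for chunk 2 of 4, then recover the prefix from it.
static void split_path_sketch(void) {
    char split_path[512];
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    // split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

    char split_prefix[512];
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        printf("prefix: %s\n", split_prefix); // "/models/ggml-model-q4_0"
    }
}
```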
@@ -987,10 +1007,38 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                   chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8   partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
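
These grammar types and helpers are only visible when LLAMA_API_INTERNAL is defined. A rough sketch of how decode_utf8 and llama_grammar_accept might fit together, modeled on how the library's own sampling code advances a grammar over an accepted token's text; the assumption that the decoded code-point vector carries a trailing 0 terminator comes from that internal usage, not from this header:

```cpp
#define LLAMA_API_INTERNAL
#include "llama.h"

// Sketch (internal API): advance the grammar stacks over the text of an accepted token.
static void grammar_advance_sketch(struct llama_grammar * grammar, const std::string & piece) {
    // decode the token text into code points, carrying over any partial UTF-8 state
    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
    const auto & code_points = decoded.first;

    // assumption: the vector is 0-terminated, so the terminator is skipped
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
    }

    grammar->partial_utf8 = decoded.second;
}
```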