llama_cpp 0.14.3 → 0.14.4

@@ -39,7 +39,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 5
 
 #ifdef __cplusplus
 extern "C" {
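The session-file format version bump means state files written by llama_cpp 0.14.3 (LLAMA_SESSION_VERSION 4) are rejected by this release when loaded. A minimal sketch of a tolerant loader, assuming the llama_load_session_file() API declared elsewhere in this header; the path handling and fallback policy are illustrative only:

    #include "llama.h"
    #include <vector>

    // Try to restore a saved session; return false when the file's magic/version
    // does not match (e.g. it was written by an older build), so the caller can
    // re-evaluate the prompt from scratch.
    static bool try_restore_session(llama_context * ctx, const char * path,
                                    std::vector<llama_token> & tokens) {
        size_t n_loaded = 0;
        tokens.resize(llama_n_ctx(ctx));
        if (!llama_load_session_file(ctx, path, tokens.data(), tokens.size(), &n_loaded)) {
            tokens.clear();
            return false;
        }
        tokens.resize(n_loaded);
        return true;
    }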
@@ -60,9 +60,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
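The comments now name the concrete tokenizer families rather than only the encoding schemes. A small sketch of dispatching on the vocab type of a loaded model, assuming the llama_vocab_type() accessor exposed elsewhere in this header:

    #include "llama.h"
    #include <cstdio>

    // Report which tokenizer family the loaded model uses.
    static void print_tokenizer_family(const llama_model * model) {
        switch (llama_vocab_type(model)) {
            case LLAMA_VOCAB_TYPE_SPM: std::printf("LLaMA/SentencePiece-style tokenizer\n");   break;
            case LLAMA_VOCAB_TYPE_BPE: std::printf("GPT-2-style byte-level BPE tokenizer\n");  break;
            case LLAMA_VOCAB_TYPE_WPM: std::printf("BERT WordPiece tokenizer\n");              break;
            default:                   std::printf("model has no vocab\n");                    break;
        }
    }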
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S   = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M   = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
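The struct gains explicit type overrides for the output and token-embedding tensors, plus a pointer for KV metadata overrides. A sketch of driving a quantization with the new fields, assuming llama_model_quantize_default_params() and llama_model_quantize() from this header; the file names and type choices are illustrative:

    #include "llama.h"

    // Quantize a GGUF file, pinning output.weight and the token embeddings to
    // explicitly chosen types instead of the ftype's defaults.
    static uint32_t quantize_with_overrides(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type   = GGML_TYPE_Q6_K; // new field
        params.token_embedding_type = GGML_TYPE_Q4_K; // new field
        params.nthread              = 8;

        // returns 0 on success
        return llama_model_quantize("ggml-model-f16.gguf", "ggml-model-q4_k_m.gguf", &params);
    }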
@@ -674,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for the last token are stored in the last row
-    // Logits for which llama_batch.logits[i] == 0 are undefined
-    // Rows: n_tokens provided with llama_batch
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    // shape: [n_tokens*n_embd] (1-dimensional)
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
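In other words, output rows now exist only for batch positions whose logits flag was set, and the *_ith accessors translate a batch index to its output row (NULL when that position produced no output). A sketch of requesting logits for just the final prompt token, assuming the llama_batch_init()/llama_decode() API from this header; the single-sequence setup is illustrative:

    #include "llama.h"
    #include <vector>

    // Decode a prompt and fetch logits only for its last token.
    static float * logits_for_last_token(llama_context * ctx, const std::vector<llama_token> & prompt) {
        llama_batch batch = llama_batch_init((int32_t) prompt.size(), 0, 1);
        for (size_t i = 0; i < prompt.size(); ++i) {
            batch.token   [batch.n_tokens]    = prompt[i];
            batch.pos     [batch.n_tokens]    = (llama_pos) i;
            batch.n_seq_id[batch.n_tokens]    = 1;
            batch.seq_id  [batch.n_tokens][0] = 0;
            // request an output row for the last token only
            batch.logits  [batch.n_tokens]    = (i + 1 == prompt.size()) ? 1 : 0;
            batch.n_tokens++;
        }

        float * logits = nullptr;
        if (llama_decode(ctx, batch) == 0) {
            // i is a batch position; it is mapped to the output row internally
            logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        }
        llama_batch_free(batch);
        return logits; // points into the context's output buffer
    }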
@@ -960,6 +970,16 @@ extern "C" {
             int32_t   n_past,
             int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
 
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
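A short usage sketch of the two new split-GGUF helpers, composing a chunk path and then recovering the prefix from it; the buffer sizes and model prefix are illustrative:

    #include "llama.h"
    #include <cstdio>

    static void split_path_demo(void) {
        char split_path[512];
        char split_prefix[512];

        // "/models/ggml-model-q4_0" + (chunk 2 of 4) -> "/models/ggml-model-q4_0-00002-of-00004.gguf"
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);

        // recover the prefix only if the split_no/split_count encoded in the name match
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
            std::printf("prefix: %s\n", split_prefix);
        }
    }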
@@ -987,10 +1007,38 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                  chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8   partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
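These declarations are only visible when LLAMA_API_INTERNAL is defined. A rough sketch of how the two newly exposed helpers fit together when a sampled token's text is accepted into a grammar, assuming the decoded code-point vector is 0-terminated as in the library source; this mirrors, but is not, the library's own internal routine:

    #define LLAMA_API_INTERNAL
    #include "llama.h"

    #include <string>
    #include <vector>

    // Advance the grammar's parse stacks over the UTF-8 text of an accepted token.
    static void accept_token_text(llama_grammar & grammar, const std::string & piece) {
        // decode the piece, carrying over any partial UTF-8 sequence from earlier tokens
        auto decoded = decode_utf8(piece, grammar.partial_utf8);
        const std::vector<uint32_t> & code_points = decoded.first;

        // the code-point vector is assumed to end with a 0 terminator
        for (auto it = code_points.begin(); it != code_points.end() && *it != 0; ++it) {
            grammar.stacks = llama_grammar_accept(grammar.rules, grammar.stacks, *it);
        }
        grammar.partial_utf8 = decoded.second;
    }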