llama_cpp 0.14.2 → 0.14.4

@@ -39,7 +39,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 5
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +60,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
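
The comments now spell out which tokenizer family each value corresponds to. As a rough illustration (not part of this diff), a caller can branch on the value reported for a loaded model; the sketch below assumes a `model` handle obtained elsewhere and uses the `llama_vocab_type(model)` accessor declared elsewhere in this header, with a hypothetical helper name.

    #include "llama.h"
    #include <cstdio>

    // Hypothetical helper: print which tokenizer family the loaded model uses.
    static void describe_tokenizer(const struct llama_model * model) {
        switch (llama_vocab_type(model)) {
            case LLAMA_VOCAB_TYPE_SPM: std::printf("SPM: LLaMA-style byte-level BPE with byte fallback\n"); break;
            case LLAMA_VOCAB_TYPE_BPE: std::printf("BPE: GPT-2-style byte-level BPE\n");                    break;
            case LLAMA_VOCAB_TYPE_WPM: std::printf("WPM: BERT-style WordPiece\n");                          break;
            default:                   std::printf("no vocab\n");                                           break;
        }
    }
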
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S  = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
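
The quantization params gain explicit tensor-type overrides for the output and token-embedding tensors plus a `kv_overrides` passthrough. Below is a minimal sketch of filling the struct, assuming the existing `llama_model_quantize_default_params()` / `llama_model_quantize()` entry points; the file paths and thread count are made up for illustration.

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; // overall quantization mix
        params.output_tensor_type   = GGML_TYPE_Q6_K;            // new: force the output.weight type
        params.token_embedding_type = GGML_TYPE_Q8_0;            // new: force the token-embedding type
        params.nthread              = 8;
        // imatrix / kv_overrides stay NULL unless importance-matrix data or
        // metadata overrides are being passed through.

        const uint32_t rc = llama_model_quantize(
            "/models/ggml-model-f16.gguf",
            "/models/ggml-model-q4_k_m.gguf",
            &params);

        llama_backend_free();
        return rc == 0 ? 0 : 1;
    }
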
@@ -388,6 +392,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -435,10 +440,24 @@ extern "C" {
     // Returns 0 on success
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                      const char * path_lora,
-                             float scale,
-                      const char * path_base_model,
-                           int32_t n_threads);
+                         const char * path_lora,
+                                float   scale,
+                         const char * path_base_model,
+                              int32_t   n_threads);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                          size_t   len,
+                         int32_t   n_embd,
+                         int32_t   il_start,
+                         int32_t   il_end);
 
     //
     // KV cache
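
A minimal usage sketch for the new `llama_control_vector_apply()`, assuming `model` and `ctx` were created elsewhere. The all-zero buffer is only there to show the expected n_embd x n_layers shape; the new `llama_n_layer()` above supplies the layer count, and passing NULL clears the vector again.

    #include "llama.h"
    #include <vector>

    static int apply_and_clear(struct llama_model * model, struct llama_context * ctx) {
        const int32_t n_embd  = llama_n_embd(model);
        const int32_t n_layer = llama_n_layer(model);

        // n_embd x n_layers buffer, "starting from layer 1" per the comment above
        std::vector<float> data((size_t) n_embd * n_layer, 0.0f);

        if (llama_control_vector_apply(ctx, data.data(), data.size(), n_embd, 1, n_layer) != 0) {
            return 1;
        }
        // NULL data clears the currently loaded vector
        llama_control_vector_apply(ctx, NULL, 0, n_embd, 1, n_layer);
        return 0;
    }
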
@@ -659,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for the last token are stored in the last row
-    // Logits for which llama_batch.logits[i] == 0 are undefined
-    // Rows: n_tokens provided with llama_batch
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    // shape: [n_tokens*n_embd] (1-dimensional)
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
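
With this change only the requested rows are stored, so a caller marks which batch positions need logits and reads them back by batch index. A rough sketch, assuming `ctx` and a tokenized prompt already exist; the helper name is illustrative, not part of the library.

    #include "llama.h"
    #include <vector>

    // Decode a prompt requesting logits only for the last token, then read that row back.
    static float * logits_for_last_token(struct llama_context * ctx,
                                         const std::vector<llama_token> & tokens) {
        llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
        for (size_t i = 0; i < tokens.size(); ++i) {
            batch.token   [i]    = tokens[i];
            batch.pos     [i]    = (llama_pos) i;
            batch.n_seq_id[i]    = 1;
            batch.seq_id  [i][0] = 0;
            batch.logits  [i]    = (i == tokens.size() - 1); // keep only the last row
        }
        batch.n_tokens = (int32_t) tokens.size();

        float * row = NULL;
        if (llama_decode(ctx, batch) == 0) {
            // i is the position within the batch; positions with logits[i] == 0 return NULL
            row = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        }
        llama_batch_free(batch);
        return row; // points into ctx-owned storage: n_vocab floats, or NULL on failure
    }
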
@@ -945,6 +970,16 @@ extern "C" {
             int32_t   n_past,
             int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
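A short sketch of the two new split-GGUF helpers, following the examples given in their doc comments; the model path is the same illustrative one used there.

    #include "llama.h"
    #include <cstdio>

    int main() {
        char split_path[512];
        char split_prefix[512];

        // "/models/ggml-model-q4_0" + chunk 2 of 4 -> "/models/ggml-model-q4_0-00002-of-00004.gguf"
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);

        // Recovers the prefix only if split_no/split_count match the file name.
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
            std::printf("%s -> %s\n", split_path, split_prefix);
        }
        return 0;
    }
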
@@ -972,10 +1007,38 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                  chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8  partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
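
For the newly exposed grammar internals, a hedged sketch of how the helpers compose: decode the UTF-8 of an accepted piece, feed each code point through `llama_grammar_accept`, and carry the partial-UTF-8 state forward. It assumes a `llama_grammar *` built elsewhere (e.g. via `llama_grammar_init`) and requires defining LLAMA_API_INTERNAL before including the header; the helper name is illustrative.

    #define LLAMA_API_INTERNAL
    #include "llama.h"
    #include <string>

    // Advance the grammar stacks over the UTF-8 text of an accepted piece.
    static void accept_piece(struct llama_grammar * grammar, const std::string & piece) {
        const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
        const auto & code_points = decoded.first;

        // the trailing 0 terminator is not fed to the grammar
        for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
            grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
        }
        grammar->partial_utf8 = decoded.second; // remember any incomplete UTF-8 sequence
    }
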