llama_cpp 0.14.3 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,9 +37,13 @@
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 5
+
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +64,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -117,6 +121,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S  = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +280,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
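A minimal sketch of how the new per-tensor overrides might be used, assuming the GGUF paths are placeholders; llama_model_quantize_default_params() and llama_model_quantize() are existing entry points in this header:

```cpp
#include "llama.h"

int main() {
    // start from the library defaults, then opt into the new overrides
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread              = 8;
    params.ftype                = LLAMA_FTYPE_MOSTLY_IQ4_XS;
    params.output_tensor_type   = GGML_TYPE_Q8_0; // keep output.weight at higher precision
    params.token_embedding_type = GGML_TYPE_Q8_0; // keep token embeddings at higher precision

    // paths are placeholders for illustration only
    return llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &params) == 0 ? 0 : 1;
}
```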
@@ -519,6 +527,7 @@ extern "C" {
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
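A hedged sketch of handling the return value this comment documents; ctx and n_keep are assumed to exist, and the bool return is taken from the comment above:

```cpp
// drop everything sequence 0 produced from position n_keep onwards
if (!llama_kv_cache_seq_rm(ctx, 0, n_keep, -1)) {
    // removing a partial range can fail; removing the whole sequence cannot
    llama_kv_cache_seq_rm(ctx, 0, -1, -1);
}
```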
@@ -590,35 +599,93 @@ extern "C" {
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
             uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+            uint8_t * dst),
+        "use llama_state_get_data instead");
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
            struct llama_context * ctx,
            const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+            const uint8_t * src),
+        "use llama_state_set_data instead");
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
            struct llama_context * ctx,
            const char * path_session,
            llama_token * tokens_out,
            size_t n_token_capacity,
            size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out),
+        "use llama_state_load_file instead");
 
-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
            struct llama_context * ctx,
            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            llama_seq_id seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            llama_seq_id dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+            const char * filepath,
+            llama_seq_id seq_id,
            const llama_token * tokens,
            size_t n_token_count);
 
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+            const char * filepath,
+            llama_seq_id dest_seq_id,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
+
     //
     // Decoding
     //
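A minimal sketch of the new single-sequence state API added above, assuming ctx is an initialized llama_context; the helper name copy_sequence_state is made up for illustration:

```cpp
#include <vector>
#include "llama.h"

// snapshot the KV cache of seq_src and replay it into seq_dst
static bool copy_sequence_state(llama_context * ctx, llama_seq_id seq_src, llama_seq_id seq_dst) {
    std::vector<uint8_t> buf(llama_state_seq_get_size(ctx, seq_src));
    if (llama_state_seq_get_data(ctx, buf.data(), seq_src) == 0) {
        return false;
    }
    // per the header comment, zero means the sequence data failed to load
    return llama_state_seq_set_data(ctx, buf.data(), seq_dst) != 0;
}
```

The renamed whole-context functions (llama_state_get_size, llama_state_get_data, llama_state_set_data) follow the same pattern; the old names still compile but now emit the deprecation messages shown above.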
@@ -674,23 +741,31 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for the last token are stored in the last row
-    // Logits for which llama_batch.logits[i] == 0 are undefined
-    // Rows: n_tokens provided with llama_batch
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // Logits for the ith token. For positive indices, equivalent to:
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indices can be used to access logits in reverse order, -1 is the last logit.
+    // Returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    // shape: [n_tokens*n_embd] (1-dimensional)
+    // Get all output token embeddings.
+    // When pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. For positive indices, equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
+    // Returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
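A short sketch of the new indexing semantics, assuming ctx already holds the result of a llama_decode() call:

```cpp
// only tokens flagged with batch.logits[i] != 0 produce an output row;
// -1 addresses the most recent of those rows
float * last_logits = llama_get_logits_ith(ctx, -1);
if (last_logits == nullptr) {
    // invalid index, e.g. no token in the batch requested logits
}
```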
@@ -711,6 +786,8 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
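A sketch of how the new accessors might frame a BERT-style input; model and text_tokens (a std::vector&lt;llama_token&gt;) are assumptions for illustration:

```cpp
std::vector<llama_token> input;
input.push_back(llama_token_cls(model));                           // [CLS]
input.insert(input.end(), text_tokens.begin(), text_tokens.end()); // the tokenized text
input.push_back(llama_token_sep(model));                           // [SEP]
```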
@@ -733,16 +810,16 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                            int32_t text_len,
                      llama_token * tokens,
                            int32_t n_tokens_max,
-                              bool add_bos,
-                              bool special);
+                              bool add_special,
+                              bool parse_special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
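A sketch using the renamed parameters; model and prompt are assumed to exist, and the initial buffer size of 512 is arbitrary:

```cpp
std::vector<llama_token> tokens(512);
int32_t n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           /*add_special*/   true,   // let the model insert its special tokens (e.g. BOS)
                           /*parse_special*/ false); // treat special tokens in the text as plaintext
if (n < 0) {
    tokens.resize(-n); // a negative return value is the required capacity
    n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                       tokens.data(), (int32_t) tokens.size(), true, false);
}
tokens.resize(n);
```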
@@ -960,6 +1037,16 @@ extern "C" {
             int32_t n_past,
             int32_t n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
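The doc comments above already state the expected results; spelled out as a quick sketch, with the buffer sizes chosen arbitrarily:

```cpp
char split_path[256];
llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
// split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

char split_prefix[256];
if (llama_split_prefix(split_prefix, sizeof(split_prefix),
                       "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) > 0) {
    // split_prefix == "/models/ggml-model-q4_0"
}
```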
@@ -987,10 +1074,39 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t             index;
+    const uint32_t   * code_points;
+    llama_partial_utf8 partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+void llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8 partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H