llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
--- data/vendor/tmp/llama.cpp/llama.h (0.14.3)
+++ data/vendor/tmp/llama.cpp/llama.h (0.14.5)
@@ -37,9 +37,13 @@
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 5
+
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
 
 #ifdef __cplusplus
 extern "C" {
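The new GGSQ magic and the bumped versions describe the headers of saved state files. A minimal C sketch of checking them up front, assuming (as the llama.cpp loaders do) that a saved file begins with the 32-bit magic followed by the 32-bit version; the file path is a placeholder:

```c
#include <stdint.h>
#include <stdio.h>
#include "llama.h"

int main(void) {
    FILE * fp = fopen("state.bin", "rb"); // placeholder path
    if (!fp) { perror("fopen"); return 1; }

    uint32_t magic = 0, version = 0;
    if (fread(&magic,   sizeof(magic),   1, fp) != 1 ||
        fread(&version, sizeof(version), 1, fp) != 1) {
        fclose(fp);
        return 1;
    }
    fclose(fp);

    if (magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION) {
        printf("full session/state file, version %u\n", version);
    } else if (magic == LLAMA_STATE_SEQ_MAGIC && version == LLAMA_STATE_SEQ_VERSION) {
        printf("single-sequence state file, version %u\n", version);
    } else {
        printf("unknown or incompatible state file\n");
    }
    return 0;
}
```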
@@ -60,9 +64,9 @@ extern "C" {
|
|
60
64
|
|
61
65
|
enum llama_vocab_type {
|
62
66
|
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
63
|
-
LLAMA_VOCAB_TYPE_SPM = 1, //
|
64
|
-
LLAMA_VOCAB_TYPE_BPE = 2, //
|
65
|
-
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
67
|
+
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
68
|
+
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
69
|
+
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
66
70
|
};
|
67
71
|
|
68
72
|
// note: these values should be synchronized with ggml_rope
|
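The rewritten comments spell out which tokenizer family each value means. A short C sketch that reports it for a loaded model; it assumes the llama_vocab_type() accessor declared elsewhere in llama.h, and the model path is a placeholder:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (!model) return 1;

    // Report which tokenizer family the model metadata declares.
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_SPM: printf("SPM: LLaMA tokenizer (byte-level BPE with byte fallback)\n"); break;
        case LLAMA_VOCAB_TYPE_BPE: printf("BPE: GPT-2 style byte-level BPE\n");                          break;
        case LLAMA_VOCAB_TYPE_WPM: printf("WPM: BERT-style WordPiece\n");                                break;
        default:                   printf("no vocab\n");                                                 break;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```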
@@ -117,6 +121,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S  = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +280,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-        bool allow_requantize;
-        bool quantize_output_tensor;
-        bool only_copy;
-        bool pure;
-        void * imatrix;
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
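A hedged C sketch of driving the extended quantization parameters, using llama_model_quantize_default_params() and llama_model_quantize() from elsewhere in llama.h; the file names are placeholders and the chosen types are illustrative only:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_model_quantize_params qparams = llama_model_quantize_default_params();

    qparams.nthread                = 0;                         // <= 0: use std::thread::hardware_concurrency()
    qparams.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target type for most tensors
    qparams.output_tensor_type     = GGML_TYPE_Q6_K;            // new field: override the output tensor type
    qparams.token_embedding_type   = GGML_TYPE_Q8_0;            // new field: override the token embedding type
    qparams.quantize_output_tensor = true;

    // imatrix and kv_overrides are left at their defaults here.
    if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams) != 0) { // placeholder names
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```

The new LLAMA_FTYPE_MOSTLY_IQ1_M value from the previous hunk plugs into the same ftype field, but the very low-bit IQ types generally expect an importance matrix supplied through the imatrix pointer.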
@@ -519,6 +527,7 @@ extern "C" {
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
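The added comment gives llama_kv_cache_seq_rm a documented failure mode. A small hedged C sketch of checking it, assuming ctx is an already initialized llama_context:

```c
#include <stdbool.h>
#include "llama.h"

// Keep only the first n_keep positions of sequence 0. If the backend cannot
// erase a partial range, fall back to clearing the whole sequence, which the
// new comment says never fails.
static void truncate_seq0(struct llama_context * ctx, llama_pos n_keep) {
    if (!llama_kv_cache_seq_rm(ctx, 0, n_keep, -1)) {
        llama_kv_cache_seq_rm(ctx, 0, -1, -1);
    }
}
```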
@@ -590,35 +599,93 @@ extern "C" {
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
                          uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
             struct llama_context * ctx,
                    const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
             struct llama_context * ctx,
                       const char * path_session,
                      llama_token * tokens_out,
                           size_t   n_token_capacity,
                           size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out),
+        "use llama_state_load_file instead");
 
-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
             struct llama_context * ctx,
                       const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
               const llama_token * tokens,
                          size_t   n_token_count);
 
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);
+
 //
 // Decoding
 //
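A hedged C sketch of the renamed whole-context state calls and the new per-sequence variants: one helper snapshots the full state into a buffer, the other copies a single sequence's KV cache between two contexts. Both contexts are assumed to be created over compatible models, and error handling is minimal:

```c
#include <stdlib.h>
#include "llama.h"

// Snapshot the whole context state (rng, logits, embedding, kv_cache) into a heap buffer.
static uint8_t * snapshot_state(struct llama_context * ctx, size_t * out_size) {
    const size_t max_size = llama_state_get_size(ctx);
    uint8_t * buf = malloc(max_size);
    if (!buf) return NULL;
    *out_size = llama_state_get_data(ctx, buf); // bytes actually written, <= max_size
    return buf;
}

// Copy the KV cache of seq_id in ctx_src into dest_seq_id of ctx_dst.
// Returns 0 on success, -1 on failure (llama_state_seq_set_data returns zero on failure).
static int copy_sequence(struct llama_context * ctx_src, llama_seq_id seq_id,
                         struct llama_context * ctx_dst, llama_seq_id dest_seq_id) {
    const size_t size = llama_state_seq_get_size(ctx_src, seq_id);
    uint8_t * buf = malloc(size);
    if (!buf) return -1;

    llama_state_seq_get_data(ctx_src, buf, seq_id);
    const size_t n = llama_state_seq_set_data(ctx_dst, buf, dest_seq_id);
    free(buf);
    return n > 0 ? 0 : -1;
}
```

Restoring the full snapshot goes through llama_state_set_data(); the old llama_copy_state_data()/llama_set_state_data() names still compile but are marked DEPRECATED above.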
@@ -674,23 +741,31 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for
-    //
-    // Rows:
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // Logits for the ith token. For positive indices, Equivalent to:
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    //
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. For positive indices, Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
@@ -711,6 +786,8 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
@@ -733,16 +810,16 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                          int32_t   text_len,
                      llama_token * tokens,
                          int32_t   n_tokens_max,
-                             bool   add_bos,
-                             bool   special);
+                             bool   add_special,
+                             bool   parse_special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
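A hedged C sketch of the renamed flags: add_special controls insertion of the model's special tokens around the input, while parse_special controls whether special-token text inside the input is recognized rather than treated as plain text. The resize-and-retry pattern relies on the negative return value documented above; the helper itself is illustrative:

```c
#include <stdlib.h>
#include <string.h>
#include "llama.h"

// Tokenize text with the renamed flags; returns a malloc'd array (caller frees)
// and stores the token count in *out_n, or returns NULL on failure.
static llama_token * tokenize_prompt(const struct llama_model * model,
                                     const char * text, int32_t * out_n) {
    const int32_t text_len = (int32_t) strlen(text);

    int32_t cap = text_len + 8; // generous first guess
    llama_token * tokens = malloc((size_t) cap * sizeof(*tokens));
    if (!tokens) return NULL;

    int32_t n = llama_tokenize(model, text, text_len, tokens, cap,
                               /*add_special=*/  true,    // let the model add its special tokens
                               /*parse_special=*/ false); // treat "<s>" etc. in the text as plain text
    if (n < 0) {                // buffer too small: -n is the required size
        cap = -n;
        llama_token * grown = realloc(tokens, (size_t) cap * sizeof(*tokens));
        if (!grown) { free(tokens); return NULL; }
        tokens = grown;
        n = llama_tokenize(model, text, text_len, tokens, cap, true, false);
    }
    *out_n = n;
    return tokens;
}
```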
@@ -960,6 +1037,16 @@ extern "C" {
                            int32_t   n_past,
                            int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
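A small C sketch of the new split-GGUF helpers, mirroring the examples in the comments above; the model prefix is a placeholder:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    char split_path[512];
    char split_prefix[512];

    // Build the file name of shard 2 of 4 for a given prefix (placeholder prefix).
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    printf("shard path:   %s\n", split_path); // .../ggml-model-q4_0-00002-of-00004.gguf

    // Recover the prefix, but only if split_no/split_count match the file name.
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        printf("shard prefix: %s\n", split_prefix);
    }
    return 0;
}
```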
@@ -987,10 +1074,39 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+void llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                  chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8   partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H