llama_cpp 0.14.3 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/llama.h

@@ -39,7 +39,7 @@

 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 5

 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +60,9 @@ extern "C" {

     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM = 1, //
-        LLAMA_VOCAB_TYPE_BPE = 2, //
-        LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
     };

     // note: these values should be synchronized with ggml_rope
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S  = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@ extern "C" {

     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-
-
-        bool
-        bool
-
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;

     // grammar types
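
The quantization parameter block gains two per-tensor type overrides (output_tensor_type, token_embedding_type) next to the existing imatrix and kv_overrides pointers. Below is a minimal C sketch of how the extended struct could be used through the existing llama_model_quantize() API; the file names, thread count, and the specific ggml types chosen here are illustrative, not part of this change:

// Sketch: quantize a GGUF model, pinning the output and token-embedding
// tensors to explicit ggml types (the fields added in this release).
#include "llama.h"
#include <stdio.h>

int main(void) {
    struct llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; // overall target ftype
    params.output_tensor_type   = GGML_TYPE_Q6_K;            // override for the output tensor
    params.token_embedding_type = GGML_TYPE_Q4_K;            // override for token embeddings
    params.nthread              = 8;                         // <=0 falls back to hardware_concurrency()

    // illustrative paths
    const uint32_t rc = llama_model_quantize("ggml-model-f16.gguf", "ggml-model-q4_k_m.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "quantization failed (code %u)\n", rc);
        return 1;
    }
    return 0;
}
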
@@ -674,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);

     // Token logits obtained from the last call to llama_decode()
-    // The logits for
-    //
-    // Rows:
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);

     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

-    // Get all output token embeddings
-    //
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

     // Get the embeddings for a sequence id
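
The rewritten comments document the new output layout: llama_decode() stores logits and embeddings only for batch positions with llama_batch.logits[i] != 0, and the *_ith accessors resolve the batch index through the context's output_ids table (returning NULL for invalid ids). A rough C sketch of requesting output for just the last prompt token; ctx, model, prompt_tokens, and n_prompt are assumed to exist and are not part of this change:

// Sketch: only the last token of the batch requests logits/embeddings.
struct llama_batch batch = llama_batch_init(/*n_tokens*/ n_prompt, /*embd*/ 0, /*n_seq_max*/ 1);

for (int i = 0; i < n_prompt; ++i) {
    batch.token[i]     = prompt_tokens[i];
    batch.pos[i]       = i;
    batch.n_seq_id[i]  = 1;
    batch.seq_id[i][0] = 0;
    batch.logits[i]    = (i == n_prompt - 1); // output only for the final position
}
batch.n_tokens = n_prompt;

if (llama_decode(ctx, batch) == 0) {
    // The index is the position in the batch; llama_get_logits_ith() maps it to
    // the single stored row even though most positions produced no output.
    const float * logits  = llama_get_logits_ith(ctx, n_prompt - 1);
    const int     n_vocab = llama_n_vocab(model);
    (void) logits; (void) n_vocab; // e.g. pick the argmax over logits[0..n_vocab-1]
}
llama_batch_free(batch);
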
@@ -960,6 +970,16 @@ extern "C" {
             int32_t n_past,
             int32_t n_predict);

+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

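
The new split-GGUF helpers mirror the examples in the doc comments above: llama_split_path() composes the chunk file name (e.g. ...-00002-of-00004.gguf) and llama_split_prefix() recovers the prefix only when the encoded split numbers match. A small C sketch; buffer sizes and paths are placeholders:

// Sketch: compose and decompose split-GGUF chunk names with the new helpers.
#include "llama.h"
#include <stdio.h>

int main(void) {
    char split_path[512];
    char split_prefix[512];

    // "/models/ggml-model-q4_0", chunk 2 of 4 -> "/models/ggml-model-q4_0-00002-of-00004.gguf"
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    printf("path:   %s\n", split_path);

    // The prefix is recovered only if split_no/split_count match the name.
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        printf("prefix: %s\n", split_prefix);
    }
    return 0;
}
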
@@ -987,10 +1007,38 @@ extern "C" {

 struct ggml_tensor;

+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>> rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t index;
+    const uint32_t * code_points;
+    llama_partial_utf8 partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );

+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8 partial_start);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H