llama_cpp 0.14.3 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/llama.h

@@ -39,7 +39,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 5

 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +60,9 @@ extern "C" {

     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM = 1, //
-        LLAMA_VOCAB_TYPE_BPE = 2, //
-        LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
     };

     // note: these values should be synchronized with ggml_rope
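The reworded comments spell out which tokenizer family each vocab type refers to. As a hedged illustration only: a caller could branch on the type via the llama_vocab_type(model) accessor (already in llama.h, not part of this diff); the helper name below is hypothetical.

    #include "llama.h"

    // Hypothetical helper: report the tokenizer family of a loaded model,
    // mirroring the updated enum comments above.
    static const char * tokenizer_family(const struct llama_model * model) {
        switch (llama_vocab_type(model)) {
            case LLAMA_VOCAB_TYPE_SPM: return "LLaMA tokenizer (SPM)";
            case LLAMA_VOCAB_TYPE_BPE: return "GPT-2 tokenizer (byte-level BPE)";
            case LLAMA_VOCAB_TYPE_WPM: return "BERT tokenizer (WordPiece)";
            default:                   return "no vocab";
        }
    }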
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@

     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-
-
-        bool
-        bool
-
+        int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype; // quantize to this llama_ftype
+        enum ggml_type output_tensor_type; // output tensor type
+        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        bool allow_requantize; // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+        bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure; // quantize all tensors to the default type
+        void * imatrix; // pointer to importance matrix data
+        void * kv_overrides; // pointer to vector containing overrides
     } llama_model_quantize_params;

     // grammar types
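llama_model_quantize_params gains per-tensor type overrides (output_tensor_type, token_embedding_type) plus the imatrix/kv_overrides pointers. A minimal sketch of filling the new fields, assuming the existing llama_model_quantize_default_params()/llama_model_quantize() entry points (not shown in this diff) and placeholder file names:

    #include "llama.h"

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.nthread              = 8;
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type   = GGML_TYPE_Q8_0; // new: keep output.weight at higher precision
        params.token_embedding_type = GGML_TYPE_Q8_0; // new: keep token embeddings at higher precision
        params.imatrix              = NULL;           // optional importance-matrix data (the IQ1/IQ2 ftypes generally require one)
        // placeholder paths; llama_model_quantize() returns 0 on success
        return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    }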
@@ -674,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);

     // Token logits obtained from the last call to llama_decode()
-    // The logits for
-    //
-    // Rows:
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);

     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

-    // Get all output token embeddings
-    //
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

     // Get the embeddings for a sequence id
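The ith-token accessors now index through ctx->output_ids and return NULL for positions that produced no output, instead of assuming one row per batch position. A hedged sketch of a caller respecting the new NULL contract (llama_get_model()/llama_n_vocab() are existing API, not part of this diff; the helper name is hypothetical):

    #include "llama.h"

    // Greedy-pick the most likely token for batch position i, or -1 when no
    // logits were requested for that position (llama_batch.logits[i] == 0).
    static llama_token argmax_token_at(struct llama_context * ctx, int32_t i) {
        const float * logits = llama_get_logits_ith(ctx, i);
        if (logits == NULL) {
            return -1; // new behavior: invalid ids return NULL
        }
        const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));
        llama_token best = 0;
        for (int32_t t = 1; t < n_vocab; ++t) {
            if (logits[t] > logits[best]) {
                best = t;
            }
        }
        return best;
    }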
@@ -960,6 +970,16 @@
         int32_t n_past,
         int32_t n_predict);

+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

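The new split-GGUF helpers compose and parse the "-00002-of-00004.gguf" style shard names. A short sketch using the example values from the doc comments above; the demo function name is made up for illustration:

    #include "llama.h"
    #include <stdio.h>

    static void split_path_demo(void) {
        char split_path[256];
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
        // split_path is now "/models/ggml-model-q4_0-00002-of-00004.gguf"

        char split_prefix[64];
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
            // prefix is recovered only when split_no/split_count match the file name
            printf("%s\n", split_prefix); // "/models/ggml-model-q4_0"
        }
    }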
@@ -987,10 +1007,38 @@

 struct ggml_tensor;

+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>> rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t index;
+    const uint32_t * code_points;
+    llama_partial_utf8 partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );

+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8 partial_start);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
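The grammar internals (llama_partial_utf8, llama_grammar, llama_grammar_accept, decode_utf8) are now exposed under LLAMA_API_INTERNAL. A hedged sketch of how they fit together when advancing a grammar over one decoded piece of text; the helper name is hypothetical, and the assumption that decode_utf8 appends a terminating 0 code point comes from upstream llama.cpp, not from this diff:

    #define LLAMA_API_INTERNAL
    #include "llama.h"

    #include <string>

    // Advance the grammar stacks over one UTF-8 piece; returns false when the
    // piece is rejected (all stacks die out).
    static bool grammar_accept_piece(struct llama_grammar * grammar, const std::string & piece) {
        const auto decoded = decode_utf8(piece, grammar->partial_utf8);
        for (const uint32_t cp : decoded.first) {
            if (cp == 0) {
                break; // assumed trailing terminator emitted by decode_utf8
            }
            grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, cp);
            if (grammar->stacks.empty()) {
                return false;
            }
        }
        grammar->partial_utf8 = decoded.second;
        return true;
    }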