llama_cpp 0.14.2 → 0.14.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -39,7 +39,7 @@
|
|
39
39
|
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
40
40
|
|
41
41
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
42
|
-
#define LLAMA_SESSION_VERSION
|
42
|
+
#define LLAMA_SESSION_VERSION 5
|
43
43
|
|
44
44
|
#ifdef __cplusplus
|
45
45
|
extern "C" {
|
@@ -60,9 +60,9 @@ extern "C" {
|
|
60
60
|
|
61
61
|
enum llama_vocab_type {
|
62
62
|
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
63
|
-
LLAMA_VOCAB_TYPE_SPM = 1, //
|
64
|
-
LLAMA_VOCAB_TYPE_BPE = 2, //
|
65
|
-
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
63
|
+
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
64
|
+
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
65
|
+
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
66
66
|
};
|
67
67
|
|
68
68
|
// note: these values should be synchronized with ggml_rope
|
@@ -117,6 +117,7 @@ extern "C" {
|
|
117
117
|
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
|
118
118
|
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
119
119
|
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
120
|
+
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
120
121
|
|
121
122
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
122
123
|
};
|
@@ -275,13 +276,16 @@ extern "C" {
|
|
275
276
|
|
276
277
|
// model quantization parameters
|
277
278
|
typedef struct llama_model_quantize_params {
|
278
|
-
int32_t nthread;
|
279
|
-
enum llama_ftype ftype;
|
280
|
-
|
281
|
-
|
282
|
-
bool
|
283
|
-
bool
|
284
|
-
|
279
|
+
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
280
|
+
enum llama_ftype ftype; // quantize to this llama_ftype
|
281
|
+
enum ggml_type output_tensor_type; // output tensor type
|
282
|
+
enum ggml_type token_embedding_type; // token embeddings tensor type
|
283
|
+
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
284
|
+
bool quantize_output_tensor; // quantize output.weight
|
285
|
+
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
286
|
+
bool pure; // quantize all tensors to the default type
|
287
|
+
void * imatrix; // pointer to importance matrix data
|
288
|
+
void * kv_overrides; // pointer to vector containing overrides
|
285
289
|
} llama_model_quantize_params;
|
286
290
|
|
287
291
|
// grammar types
|
@@ -388,6 +392,7 @@ extern "C" {
|
|
388
392
|
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
389
393
|
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
390
394
|
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
395
|
+
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
391
396
|
|
392
397
|
// Get the model's RoPE frequency scaling factor
|
393
398
|
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
@@ -435,10 +440,24 @@ extern "C" {
|
|
435
440
|
// Returns 0 on success
|
436
441
|
LLAMA_API int32_t llama_model_apply_lora_from_file(
|
437
442
|
const struct llama_model * model,
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
443
|
+
const char * path_lora,
|
444
|
+
float scale,
|
445
|
+
const char * path_base_model,
|
446
|
+
int32_t n_threads);
|
447
|
+
|
448
|
+
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
449
|
+
// the currently loaded vector.
|
450
|
+
// n_embd should be the size of a single layer's control, and data should point
|
451
|
+
// to an n_embd x n_layers buffer starting from layer 1.
|
452
|
+
// il_start and il_end are the layer range the vector should apply to (both inclusive)
|
453
|
+
// See llama_control_vector_load in common to load a control vector.
|
454
|
+
LLAMA_API int32_t llama_control_vector_apply(
|
455
|
+
struct llama_context * lctx,
|
456
|
+
const float * data,
|
457
|
+
size_t len,
|
458
|
+
int32_t n_embd,
|
459
|
+
int32_t il_start,
|
460
|
+
int32_t il_end);
|
442
461
|
|
443
462
|
//
|
444
463
|
// KV cache
|
@@ -659,23 +678,29 @@ extern "C" {
|
|
659
678
|
LLAMA_API void llama_synchronize(struct llama_context * ctx);
|
660
679
|
|
661
680
|
// Token logits obtained from the last call to llama_decode()
|
662
|
-
// The logits for
|
663
|
-
//
|
664
|
-
// Rows:
|
681
|
+
// The logits for which llama_batch.logits[i] != 0 are stored contiguously
|
682
|
+
// in the order they have appeared in the batch.
|
683
|
+
// Rows: number of tokens for which llama_batch.logits[i] != 0
|
665
684
|
// Cols: n_vocab
|
666
685
|
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
667
686
|
|
668
687
|
// Logits for the ith token. Equivalent to:
|
669
|
-
// llama_get_logits(ctx) + i*n_vocab
|
688
|
+
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
|
689
|
+
// returns NULL for invalid ids.
|
670
690
|
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
671
691
|
|
672
|
-
// Get all output token embeddings
|
673
|
-
//
|
692
|
+
// Get all output token embeddings.
|
693
|
+
// When pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
|
694
|
+
// the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
|
695
|
+
// in the order they have appeared in the batch.
|
696
|
+
// shape: [n_outputs*n_embd]
|
697
|
+
// Otherwise, returns NULL.
|
674
698
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
675
699
|
|
676
|
-
// Get the embeddings for the ith token
|
677
|
-
// llama_get_embeddings(ctx) + i*n_embd
|
700
|
+
// Get the embeddings for the ith token. Equivalent to:
|
701
|
+
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
|
678
702
|
// shape: [n_embd] (1-dimensional)
|
703
|
+
// returns NULL for invalid ids.
|
679
704
|
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
680
705
|
|
681
706
|
// Get the embeddings for a sequence id
|
@@ -945,6 +970,16 @@ extern "C" {
|
|
945
970
|
int32_t n_past,
|
946
971
|
int32_t n_predict);
|
947
972
|
|
973
|
+
/// @details Build a split GGUF final path for this chunk.
|
974
|
+
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
|
975
|
+
// Returns the split_path length.
|
976
|
+
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
|
977
|
+
|
978
|
+
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
|
979
|
+
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
|
980
|
+
// Returns the split_prefix length.
|
981
|
+
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
|
982
|
+
|
948
983
|
// Performance information
|
949
984
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
950
985
|
|
@@ -972,10 +1007,38 @@ extern "C" {
|
|
972
1007
|
|
973
1008
|
struct ggml_tensor;
|
974
1009
|
|
1010
|
+
struct llama_partial_utf8 {
|
1011
|
+
uint32_t value; // bit value so far (unshifted)
|
1012
|
+
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
1013
|
+
};
|
1014
|
+
|
1015
|
+
struct llama_grammar {
|
1016
|
+
const std::vector<std::vector<llama_grammar_element>> rules;
|
1017
|
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
1018
|
+
|
1019
|
+
// buffer for partially generated UTF-8 sequence from accepted tokens
|
1020
|
+
llama_partial_utf8 partial_utf8;
|
1021
|
+
};
|
1022
|
+
|
1023
|
+
struct llama_grammar_candidate {
|
1024
|
+
size_t index;
|
1025
|
+
const uint32_t * code_points;
|
1026
|
+
llama_partial_utf8 partial_utf8;
|
1027
|
+
};
|
1028
|
+
|
975
1029
|
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
976
1030
|
struct llama_context * ctx
|
977
1031
|
);
|
978
1032
|
|
1033
|
+
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
1034
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
1035
|
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
1036
|
+
const uint32_t chr);
|
1037
|
+
|
1038
|
+
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
1039
|
+
const std::string & src,
|
1040
|
+
llama_partial_utf8 partial_start);
|
1041
|
+
|
979
1042
|
#endif // LLAMA_API_INTERNAL
|
980
1043
|
|
981
1044
|
#endif // LLAMA_H
|