llama_cpp 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
--- a/data/vendor/tmp/llama.cpp/llama.h
+++ b/data/vendor/tmp/llama.cpp/llama.h
@@ -39,7 +39,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 5
 
 #ifdef __cplusplus
 extern "C" {
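The session file format version is bumped in this release, so state files written by older builds are rejected when read back. Below is a minimal sketch of a session round trip against the public llama_save_session_file / llama_load_session_file API; the path and sizes are placeholders, not something from the gem:

```cpp
// Sketch: save and reload KV-cache state; llama_load_session_file returns false
// when the file was written with a different LLAMA_SESSION_VERSION.
#include "llama.h"
#include <vector>

bool roundtrip_session(llama_context * ctx, const std::vector<llama_token> & tokens) {
    const char * path = "state.bin"; // placeholder path

    if (!llama_save_session_file(ctx, path, tokens.data(), tokens.size())) {
        return false;
    }

    std::vector<llama_token> loaded(llama_n_ctx(ctx));
    size_t n_loaded = 0;
    if (!llama_load_session_file(ctx, path, loaded.data(), loaded.size(), &n_loaded)) {
        return false; // e.g. session version mismatch
    }
    loaded.resize(n_loaded);
    return true;
}
```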
@@ -60,9 +60,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, //
-        LLAMA_VOCAB_TYPE_BPE  = 2, //
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
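For context, this enum is what the vocab-type accessor reports for a loaded model. A small sketch (not part of the gem, and assuming the llama_vocab_type(model) accessor) mapping the values back to the tokenizer families named in the new comments:

```cpp
// Sketch: report which tokenizer family a loaded model uses.
#include "llama.h"

const char * vocab_type_name(const llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
        case LLAMA_VOCAB_TYPE_SPM:  return "SPM: byte-level BPE with byte fallback";
        case LLAMA_VOCAB_TYPE_BPE:  return "BPE: GPT-2 style byte-level BPE";
        case LLAMA_VOCAB_TYPE_WPM:  return "WPM: BERT WordPiece";
        default:                    return "unknown";
    }
}
```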
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S   = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M   = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +276,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-
-
-        bool
-        bool
-
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
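The struct gains two tensor-type overrides and documented comments for every field. A hedged sketch of filling it from C++ and running a quantization; the file names are placeholders, and the new output_tensor_type / token_embedding_type fields are simply left at their defaults so the quantizer chooses types on its own:

```cpp
// Sketch: quantize a GGUF file using the extended llama_model_quantize_params.
#include "llama.h"

uint32_t quantize_example() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_XS; // or the new LLAMA_FTYPE_MOSTLY_IQ1_M
    params.nthread = 8;                         // <= 0 falls back to hardware_concurrency()
    // output_tensor_type / token_embedding_type are left untouched here,
    // i.e. the quantizer picks those types itself.

    // returns 0 on success; paths are placeholders
    return llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &params);
}
```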
@@ -388,6 +392,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -435,10 +440,24 @@ extern "C" {
     // Returns 0 on success
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
-
-
-
-
+                          const char * path_lora,
+                                 float   scale,
+                          const char * path_base_model,
+                               int32_t   n_threads);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                            size_t   len,
+                           int32_t   n_embd,
+                           int32_t   il_start,
+                           int32_t   il_end);
 
     //
     // KV cache
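A sketch of the call pattern the new comment describes, sized with llama_n_embd and the newly exposed llama_n_layer; the all-zero buffer is a stand-in for a real control vector loaded via llama_control_vector_load in common:

```cpp
// Sketch: apply a dummy (all-zero) control vector to every layer, then clear it.
#include "llama.h"
#include <vector>

int32_t apply_dummy_control_vector(llama_context * ctx, const llama_model * model) {
    const int32_t n_embd  = llama_n_embd(model);
    const int32_t n_layer = llama_n_layer(model);

    // n_embd x n_layers buffer, "starting from layer 1"
    std::vector<float> data((size_t) n_embd * n_layer, 0.0f);

    int32_t ret = llama_control_vector_apply(ctx, data.data(), data.size(),
                                             n_embd, /*il_start=*/1, /*il_end=*/n_layer);
    if (ret != 0) {
        return ret;
    }

    // passing NULL clears the currently loaded vector
    return llama_control_vector_apply(ctx, nullptr, 0, n_embd, 1, n_layer);
}
```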
@@ -659,23 +678,29 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for
-    //
-    // Rows:
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
     // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    //
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
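To make the new indexing comments concrete, here is a sketch that decodes a prompt with llama_batch_get_one (which requests logits only for the last position) and then reads that row back, checking the NULL-on-invalid-id behaviour documented above:

```cpp
// Sketch: decode a prompt and read the logits of its final position.
#include "llama.h"
#include <vector>

std::vector<float> logits_of_last_token(llama_context * ctx, const llama_model * model,
                                        std::vector<llama_token> & prompt) {
    llama_batch batch = llama_batch_get_one(prompt.data(), (int32_t) prompt.size(),
                                            /*pos_0=*/0, /*seq_id=*/0);
    if (llama_decode(ctx, batch) != 0) {
        return {};
    }

    // only the last position of this batch produced logits
    const float * row = llama_get_logits_ith(ctx, batch.n_tokens - 1);
    if (row == nullptr) {
        return {}; // invalid id
    }
    return std::vector<float>(row, row + llama_n_vocab(model));
}
```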
@@ -945,6 +970,16 @@ extern "C" {
                               int32_t   n_past,
                               int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
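A small usage sketch of the two split-GGUF helpers, mirroring the examples in the doc comments above; the buffer sizes are arbitrary:

```cpp
// Sketch: build the path of shard 2 of 4, then recover the prefix from it.
#include "llama.h"
#include <cstdio>

void split_path_demo() {
    char split_path[512];
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    // -> "/models/ggml-model-q4_0-00002-of-00004.gguf"

    char split_prefix[512];
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        std::printf("prefix: %s\n", split_prefix); // "/models/ggml-model-q4_0"
    }
}
```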
@@ -972,10 +1007,38 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8                                      partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                  chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8   partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
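The grammar declarations above are only visible when LLAMA_API_INTERNAL is defined and are exposed mainly for the tests, so they should be treated as unstable. A hedged sketch of how decode_utf8 carries partial UTF-8 state from one token's text to the next; decode_piece and its usage pattern are illustrative, not from the library:

```cpp
// Sketch (internal API): decode a token's text to code points while carrying
// any incomplete trailing UTF-8 sequence over to the next call.
#define LLAMA_API_INTERNAL
#include "llama.h"

#include <string>
#include <vector>

std::vector<uint32_t> decode_piece(const std::string & piece, llama_partial_utf8 & state) {
    auto decoded = decode_utf8(piece, state); // pair of (code points, new partial state)
    state = decoded.second;                   // remember unfinished bytes, if any
    return decoded.first;
}

// usage: start with llama_partial_utf8 state = { 0, 0 }; then call decode_piece
// once per detokenized piece, reusing `state` across calls
```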