llama_cpp 0.14.6 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +90 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -3
- data/vendor/tmp/llama.cpp/Makefile +52 -22
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +394 -44
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +996 -455
- data/vendor/tmp/llama.cpp/llama.h +46 -15
- data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
The hunks below are from data/vendor/tmp/llama.cpp/llama.h (+46 -15):

@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 6

 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
@@ -69,6 +69,18 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
     };

+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+    };
+
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
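The new `llama_vocab_pre_type` values record which pre-tokenizer a converted model expects. A minimal illustrative sketch (the helper is hypothetical, not part of the library) that maps the values to printable names, e.g. for logging:

```cpp
#include "llama.h"

// Hypothetical helper: name each pre-tokenizer type for log output.
static const char * vocab_pre_type_name(enum llama_vocab_pre_type t) {
    switch (t) {
        case LLAMA_VOCAB_PRE_TYPE_DEFAULT:        return "default";
        case LLAMA_VOCAB_PRE_TYPE_LLAMA3:         return "llama3";
        case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:   return "deepseek-llm";
        case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: return "deepseek-coder";
        case LLAMA_VOCAB_PRE_TYPE_FALCON:         return "falcon";
        case LLAMA_VOCAB_PRE_TYPE_MPT:            return "mpt";
        case LLAMA_VOCAB_PRE_TYPE_STARCODER:      return "starcoder";
        case LLAMA_VOCAB_PRE_TYPE_GPT2:           return "gpt2";
    }
    return "unknown";
}
```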
@@ -195,15 +207,19 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_INT,
         LLAMA_KV_OVERRIDE_TYPE_FLOAT,
         LLAMA_KV_OVERRIDE_TYPE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_STR,
     };

     struct llama_model_kv_override {
-        char key[128];
         enum llama_model_kv_override_type tag;
+
+        char key[128];
+
         union {
-            int64_t
-            double
-            bool
+            int64_t val_i64;
+            double  val_f64;
+            bool    val_bool;
+            char    val_str[128];
         };
     };

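The override union members are now named, and a string variant was added. A sketch of filling one string override; the metadata key and value are examples only, and the empty-key terminator is an assumption based on the sentinel convention in llama.cpp's common code:

```cpp
#include <cstdio>
#include "llama.h"

// Hypothetical helper: one string override plus the (assumed) empty-key
// element that terminates the kv_overrides array.
static void make_overrides(struct llama_model_kv_override out[2]) {
    out[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::snprintf(out[0].key,     sizeof(out[0].key),     "tokenizer.ggml.pre"); // example key
    std::snprintf(out[0].val_str, sizeof(out[0].val_str), "llama3");             // example value
    out[1].key[0] = '\0'; // terminator
}
```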
@@ -232,9 +248,10 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;

         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;
-        bool use_mmap;
-        bool use_mlock;
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
     };

     struct llama_context_params {
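The model params gain a `check_tensors` flag for validating tensor data at load time. A minimal usage sketch, using the loader entry points from this header:

```cpp
#include "llama.h"

// Sketch: enable the new tensor validation when loading a model.
static struct llama_model * load_checked(const char * path) {
    struct llama_model_params mp = llama_model_default_params();
    mp.use_mmap      = true; // map the file instead of reading it into RAM
    mp.check_tensors = true; // new flag: validate tensor data while loading
    return llama_load_model_from_file(path, mp);
}
```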
@@ -270,6 +287,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
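Flash attention is opt-in per context. A minimal sketch of enabling it when creating a context (the context size here is an arbitrary example):

```cpp
#include "llama.h"

// Sketch: opt in to flash attention when creating a context.
static struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cp = llama_context_default_params();
    cp.n_ctx      = 4096; // example context size
    cp.flash_attn = true; // new flag; needs a backend that supports it
    return llama_new_context_with_model(model, cp);
}
```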
@@ -288,6 +306,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // quantize all tensors to the default type
+        bool keep_split;             // quantize to the same number of shards
         void * imatrix;              // pointer to importance matrix data
         void * kv_overrides;         // pointer to vector containing overrides
     } llama_model_quantize_params;
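A sketch of quantizing with the new `keep_split` flag so a sharded input produces the same number of output shards; the target ftype is an example:

```cpp
#include "llama.h"

// Sketch: quantize while preserving the shard layout of a split model.
static int quantize_keep_split(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params qp = llama_model_quantize_default_params();
    qp.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M; // example target type
    qp.keep_split = true;                      // emit the same number of shards
    return (int) llama_model_quantize(fname_inp, fname_out, &qp);
}
```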
@@ -390,8 +409,10 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ubatch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max   (const struct llama_context * ctx);

-    LLAMA_API enum
-
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);

     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
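A short sketch of the regrouped getters: pooling type is queried from the context, vocab and rope type from the model:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: query the three type getters introduced/regrouped above.
static void print_types(const struct llama_model * model, const struct llama_context * ctx) {
    std::printf("pooling=%d vocab=%d rope=%d\n",
                (int) llama_pooling_type(ctx),
                (int) llama_vocab_type(model),
                (int) llama_rope_type(model));
}
```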
@@ -522,7 +543,7 @@ extern "C" {
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

-    // Clear the KV cache
+    // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
             struct llama_context * ctx);

@@ -783,6 +804,9 @@ extern "C" {

     LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

+    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
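A sketch of a sampling loop that stops on the new end-of-generation check, which also covers EOT-style tokens that a plain `llama_token_eos()` comparison would miss; refreshing `candidates` from fresh logits each step is elided:

```cpp
#include "llama.h"

// Sketch: stop generation via llama_token_is_eog() instead of comparing
// against a single EOS id.
static void sample_until_eog(struct llama_context * ctx,
                             const struct llama_model * model,
                             llama_token_data_array * candidates,
                             int max_tokens) {
    for (int i = 0; i < max_tokens; ++i) {
        const llama_token tok = llama_sample_token(ctx, candidates);
        if (llama_token_is_eog(model, tok)) {
            break; // true for EOS, EOT, etc.
        }
        // ... feed tok back via llama_decode() and rebuild candidates ...
    }
}
```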
@@ -796,7 +820,7 @@ extern "C" {
     // Returns -1 if unknown, 1 for true or 0 for false.
     LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);

-    //
+    // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
     LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
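A sketch of assembling a fill-in-the-middle prompt from these tokens, assuming the usual Codellama ordering (prefix token, code before the cursor, suffix token, code after the cursor, middle token):

```cpp
#include <vector>
#include "llama.h"

// Sketch: build an infill prompt; the model then generates the gap.
static std::vector<llama_token> infill_prompt(const struct llama_model * model,
                                              const std::vector<llama_token> & before,
                                              const std::vector<llama_token> & after) {
    std::vector<llama_token> p;
    p.push_back(llama_token_prefix(model));
    p.insert(p.end(), before.begin(), before.end());
    p.push_back(llama_token_suffix(model));
    p.insert(p.end(), after.begin(), after.end());
    p.push_back(llama_token_middle(model)); // generation continues from here
    return p;
}
```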
@@ -825,11 +849,13 @@ extern "C" {
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
         const struct llama_model * model,
                        llama_token token,
                               char * buf,
-                           int32_t length);
+                           int32_t length,
+                              bool special);

     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
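This is a breaking signature change: existing callers gain one trailing argument. A minimal sketch; passing `false` keeps the old behaviour (special tokens are not rendered):

```cpp
#include <string>
#include "llama.h"

// Sketch: detokenize one token with the new `special` parameter.
static std::string piece_of(const struct llama_model * model, llama_token tok, bool special) {
    char buf[64];
    const int32_t n = llama_token_to_piece(model, tok, buf, (int32_t) sizeof(buf), special);
    return n >= 0 ? std::string(buf, n) : std::string(); // negative n: buffer too small
}
```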
@@ -982,7 +1008,7 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);

-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
     LLAMA_API llama_token llama_sample_token(
             struct llama_context * ctx,
           llama_token_data_array * candidates);
@@ -1069,8 +1095,9 @@
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL

-#include <
+#include <random>
 #include <string>
+#include <vector>

 struct ggml_tensor;

@@ -1107,6 +1134,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
         llama_partial_utf8 partial_start);

+// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
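A minimal sketch of the internal variant: each sequence gets its own `std::mt19937` so parallel sampling no longer races on the context-wide generator. `LLAMA_API_INTERNAL` must be defined before including the header:

```cpp
// Sketch: per-sequence RNG for the internal sampling entry point.
#define LLAMA_API_INTERNAL
#include <random>
#include "llama.h"

static llama_token sample_for_seq(struct llama_context * ctx,
                                  llama_token_data_array * candidates,
                                  std::mt19937 & seq_rng) { // one engine per sequence
    return llama_sample_token_with_rng(ctx, candidates, seq_rng);
}
```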