llama_cpp 0.14.6 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +90 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -3
- data/vendor/tmp/llama.cpp/Makefile +52 -22
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +394 -44
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +996 -455
- data/vendor/tmp/llama.cpp/llama.h +46 -15
- data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
The hunks below are from data/vendor/tmp/llama.cpp/llama.h:

@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 6
 
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
@@ -69,6 +69,18 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
     };
 
+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+    };
+
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
@@ -195,15 +207,19 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_INT,
         LLAMA_KV_OVERRIDE_TYPE_FLOAT,
         LLAMA_KV_OVERRIDE_TYPE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_STR,
     };
 
     struct llama_model_kv_override {
-        char key[128];
         enum llama_model_kv_override_type tag;
+
+        char key[128];
+
         union {
-            int64_t
-            double
-            bool
+            int64_t val_i64;
+            double  val_f64;
+            bool    val_bool;
+            char    val_str[128];
         };
     };
 
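For illustration only (not part of the gem), a minimal C++ sketch of filling in the reworked override struct now that string-valued overrides exist; the metadata key and value below are hypothetical placeholders, not taken from the diff.

```cpp
#include <cstring>
#include "llama.h"

// Build a string-valued metadata override using the new LLAMA_KV_OVERRIDE_TYPE_STR tag.
static llama_model_kv_override make_str_override(const char * key, const char * value) {
    llama_model_kv_override ov{};
    ov.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::strncpy(ov.key,     key,   sizeof(ov.key)     - 1); // key now follows the tag field
    std::strncpy(ov.val_str, value, sizeof(ov.val_str) - 1); // value lives in the renamed union
    return ov;
}
```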
@@ -232,9 +248,10 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;
-        bool use_mmap;
-        bool use_mlock;
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
     };
 
     struct llama_context_params {
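A short sketch of the newly documented flags in use, assuming the standard llama.h loading entry points; the model path is a placeholder and error handling is elided.

```cpp
#include "llama.h"

// Load a model with mmap, without mlock, and with the new tensor-data validation.
static llama_model * load_checked(const char * path /* placeholder */) {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_mmap      = true;  // use mmap if possible
    mparams.use_mlock     = false; // do not force the model to stay in RAM
    mparams.check_tensors = true;  // new in this release: validate model tensor data
    return llama_load_model_from_file(path, mparams);
}
```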
@@ -270,6 +287,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
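A minimal sketch of turning the new flag on when creating a context; whether flash attention is actually used still depends on the backend, and the context size below is illustrative.

```cpp
#include "llama.h"

// Create a context with flash attention requested via the new flag.
static llama_context * make_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 4096;   // illustrative context size
    cparams.flash_attn = true;   // whether to use flash attention (added in this release)
    return llama_new_context_with_model(model, cparams);
}
```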
@@ -288,6 +306,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // quantize all tensors to the default type
+        bool keep_split;             // quantize to the same number of shards
         void * imatrix;              // pointer to importance matrix data
         void * kv_overrides;         // pointer to vector containing overrides
     } llama_model_quantize_params;
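A hedged sketch of the new keep_split flag; the file names and target type are placeholders, and shard naming is handled by llama.cpp itself.

```cpp
#include "llama.h"

// Quantize a sharded model and keep the same number of shards as the input.
static uint32_t quantize_keep_shards(const char * fin, const char * fout) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M; // illustrative target type
    qparams.keep_split = true;                      // quantize to the same number of shards
    return llama_model_quantize(fin, fout, &qparams); // 0 indicates success
}
```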
@@ -390,8 +409,10 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ubatch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max   (const struct llama_context * ctx);
 
-    LLAMA_API enum
-
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
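For orientation, a small sketch of the getters after this reshuffle: pooling type is now queried from the context, while vocab and rope type come from the model.

```cpp
#include "llama.h"

// Query the relocated/added type getters.
static void inspect_types(const llama_model * model, const llama_context * ctx) {
    enum llama_pooling_type pool = llama_pooling_type(ctx);  // now takes the context
    enum llama_vocab_type   voc  = llama_vocab_type(model);  // per-model
    enum llama_rope_type    rope = llama_rope_type(model);   // per-model
    (void)pool; (void)voc; (void)rope;
}
```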
@@ -522,7 +543,7 @@ extern "C" {
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
-    // Clear the KV cache
+    // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
             struct llama_context * ctx);
 
@@ -783,6 +804,9 @@ extern "C" {
 
     LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
+    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
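A sketch of how a generation-loop stop condition might use the new helper instead of comparing only against llama_token_eos(); the surrounding sampling code is omitted.

```cpp
#include "llama.h"

// Stop when the sampled token ends generation (EOS, EOT, ...), not just on EOS.
static bool should_stop(const llama_model * model, llama_token sampled) {
    return llama_token_is_eog(model, sampled);
}
```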
@@ -796,7 +820,7 @@ extern "C" {
     // Returns -1 if unknown, 1 for true or 0 for false.
     LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
 
-    //
+    // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
     LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@@ -825,11 +849,13 @@ extern "C" {
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
         const struct llama_model * model,
                        llama_token token,
                               char * buf,
-                           int32_t length
+                           int32_t length,
+                              bool special);
 
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
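A sketch of the updated call, using std::string for convenience; passing special = true renders special tokens into the output instead of skipping them.

```cpp
#include <string>
#include "llama.h"

// Convert one token to text with the new `special` argument.
static std::string piece(const llama_model * model, llama_token tok, bool special) {
    char buf[256];
    const int32_t n = llama_token_to_piece(model, tok, buf, (int32_t) sizeof(buf), special);
    if (n < 0) {
        return {}; // buffer too small; a real caller would retry with a larger buffer
    }
    return std::string(buf, (size_t) n);
}
```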
@@ -982,7 +1008,7 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);
 
-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
     LLAMA_API llama_token llama_sample_token(
             struct llama_context * ctx,
           llama_token_data_array * candidates);
@@ -1069,8 +1095,9 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
 
-#include <
+#include <random>
 #include <string>
+#include <vector>
 
 struct ggml_tensor;
 
@@ -1107,6 +1134,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
         llama_partial_utf8 partial_start);
 
+// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
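Since this is an internal, tests-only API (only visible with LLAMA_API_INTERNAL defined and only available when building against the llama.cpp sources), the sketch below merely illustrates the intent: each sequence keeps its own std::mt19937 so concurrent sampling no longer shares the context's RNG.

```cpp
#define LLAMA_API_INTERNAL
#include <random>
#include "llama.h"

// Sample with a caller-owned RNG (e.g. one per sequence) instead of the context's RNG.
static llama_token sample_for_seq(llama_context * ctx,
                                  llama_token_data_array * candidates,
                                  std::mt19937 & seq_rng) {
    return llama_sample_token_with_rng(ctx, candidates, seq_rng);
}
```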