llama_cpp 0.12.6 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/ext/llama_cpp/llama_cpp.cpp +90 -269
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +28 -23
- data/vendor/tmp/llama.cpp/Makefile +51 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -11
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +191 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +2472 -862
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +3176 -667
- data/vendor/tmp/llama.cpp/ggml-quants.h +77 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +373 -424
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +186 -102
- data/vendor/tmp/llama.cpp/ggml.c +1266 -699
- data/vendor/tmp/llama.cpp/ggml.h +59 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1517 -717
- data/vendor/tmp/llama.cpp/llama.h +87 -63
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
|
@@ -64,6 +64,15 @@ extern "C" {
|
|
|
64
64
|
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
|
65
65
|
};
|
|
66
66
|
|
|
67
|
+
// note: these values should be synchronized with ggml_rope
|
|
68
|
+
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
|
69
|
+
enum llama_rope_type {
|
|
70
|
+
LLAMA_ROPE_TYPE_NONE = -1,
|
|
71
|
+
LLAMA_ROPE_TYPE_NORM = 0,
|
|
72
|
+
LLAMA_ROPE_TYPE_NEOX = 2,
|
|
73
|
+
LLAMA_ROPE_TYPE_GLM = 4,
|
|
74
|
+
};
|
|
75
|
+
|
|
67
76
|
enum llama_token_type {
|
|
68
77
|
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
|
69
78
|
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
|
@@ -98,24 +107,37 @@ extern "C" {
|
|
|
98
107
|
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
|
99
108
|
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
|
100
109
|
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
|
101
|
-
|
|
110
|
+
LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
|
|
102
111
|
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
|
112
|
+
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
|
113
|
+
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
|
|
114
|
+
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
|
|
115
|
+
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
|
|
116
|
+
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
|
|
117
|
+
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
|
118
|
+
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
|
103
119
|
|
|
104
120
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
|
105
121
|
};
|
|
106
122
|
|
|
107
123
|
enum llama_rope_scaling_type {
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
124
|
+
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
|
|
125
|
+
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
|
|
126
|
+
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
|
|
127
|
+
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
|
|
128
|
+
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
enum llama_pooling_type {
|
|
132
|
+
LLAMA_POOLING_TYPE_NONE = 0,
|
|
133
|
+
LLAMA_POOLING_TYPE_MEAN = 1,
|
|
134
|
+
LLAMA_POOLING_TYPE_CLS = 2,
|
|
113
135
|
};
|
|
114
136
|
|
|
115
137
|
enum llama_split_mode {
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
138
|
+
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
|
139
|
+
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
|
140
|
+
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
|
119
141
|
};
|
|
120
142
|
|
|
121
143
|
typedef struct llama_token_data {
|
|
@@ -163,9 +185,9 @@ extern "C" {
|
|
|
163
185
|
} llama_batch;
|
|
164
186
|
|
|
165
187
|
enum llama_model_kv_override_type {
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
188
|
+
LLAMA_KV_OVERRIDE_TYPE_INT,
|
|
189
|
+
LLAMA_KV_OVERRIDE_TYPE_FLOAT,
|
|
190
|
+
LLAMA_KV_OVERRIDE_TYPE_BOOL,
|
|
169
191
|
};
|
|
170
192
|
|
|
171
193
|
struct llama_model_kv_override {
|
|
@@ -224,6 +246,7 @@ extern "C" {
|
|
|
224
246
|
float yarn_beta_fast; // YaRN low correction dim
|
|
225
247
|
float yarn_beta_slow; // YaRN high correction dim
|
|
226
248
|
uint32_t yarn_orig_ctx; // YaRN original context size
|
|
249
|
+
float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
|
|
227
250
|
|
|
228
251
|
ggml_backend_sched_eval_callback cb_eval;
|
|
229
252
|
void * cb_eval_user_data;
|
|
@@ -232,7 +255,6 @@ extern "C" {
|
|
|
232
255
|
enum ggml_type type_v; // data type for V cache
|
|
233
256
|
|
|
234
257
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
|
235
|
-
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
|
236
258
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
|
237
259
|
bool embedding; // embedding mode only
|
|
238
260
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
|
@@ -298,6 +320,12 @@ extern "C" {
|
|
|
298
320
|
int32_t n_eval;
|
|
299
321
|
};
|
|
300
322
|
|
|
323
|
+
// used in chat template
|
|
324
|
+
typedef struct llama_chat_message {
|
|
325
|
+
const char * role;
|
|
326
|
+
const char * content;
|
|
327
|
+
} llama_chat_message;
|
|
328
|
+
|
|
301
329
|
// Helpers for getting default parameters
|
|
302
330
|
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
|
303
331
|
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
|
@@ -306,7 +334,10 @@ extern "C" {
|
|
|
306
334
|
// Initialize the llama + ggml backend
|
|
307
335
|
// If numa is true, use NUMA optimizations
|
|
308
336
|
// Call once at the start of the program
|
|
309
|
-
LLAMA_API void llama_backend_init(bool numa);
|
|
337
|
+
LLAMA_API void llama_backend_init(void);
|
|
338
|
+
|
|
339
|
+
//optional:
|
|
340
|
+
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
|
310
341
|
|
|
311
342
|
// Call once at the end of the program - currently only used for MPI
|
|
312
343
|
LLAMA_API void llama_backend_free(void);
|
|
@@ -332,15 +363,13 @@ extern "C" {
|
|
|
332
363
|
LLAMA_API bool llama_supports_mlock (void);
|
|
333
364
|
LLAMA_API bool llama_supports_gpu_offload(void);
|
|
334
365
|
|
|
335
|
-
LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
|
|
336
|
-
LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
|
|
337
|
-
|
|
338
366
|
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
|
339
367
|
|
|
340
368
|
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
|
341
369
|
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
|
342
370
|
|
|
343
371
|
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
|
372
|
+
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
|
344
373
|
|
|
345
374
|
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
|
346
375
|
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
|
@@ -390,14 +419,6 @@ extern "C" {
|
|
|
390
419
|
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
|
391
420
|
// will be applied on top of the previous one
|
|
392
421
|
// Returns 0 on success
|
|
393
|
-
LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
|
|
394
|
-
struct llama_context * ctx,
|
|
395
|
-
const char * path_lora,
|
|
396
|
-
float scale,
|
|
397
|
-
const char * path_base_model,
|
|
398
|
-
int32_t n_threads),
|
|
399
|
-
"use llama_model_apply_lora_from_file instead");
|
|
400
|
-
|
|
401
422
|
LLAMA_API int32_t llama_model_apply_lora_from_file(
|
|
402
423
|
const struct llama_model * model,
|
|
403
424
|
const char * path_lora,
|
|
@@ -495,10 +516,12 @@ extern "C" {
|
|
|
495
516
|
llama_seq_id seq_id);
|
|
496
517
|
|
|
497
518
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
498
|
-
// If the KV cache is RoPEd, the KV data is updated accordingly
|
|
519
|
+
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
520
|
+
// - lazily on next llama_decode()
|
|
521
|
+
// - explicitly with llama_kv_cache_update()
|
|
499
522
|
// p0 < 0 : [0, p1]
|
|
500
523
|
// p1 < 0 : [p0, inf)
|
|
501
|
-
LLAMA_API void llama_kv_cache_seq_shift(
|
|
524
|
+
LLAMA_API void llama_kv_cache_seq_add(
|
|
502
525
|
struct llama_context * ctx,
|
|
503
526
|
llama_seq_id seq_id,
|
|
504
527
|
llama_pos p0,
|
|
@@ -506,7 +529,9 @@ extern "C" {
|
|
|
506
529
|
llama_pos delta);
|
|
507
530
|
|
|
508
531
|
// Integer division of the positions by factor of `d > 1`
|
|
509
|
-
// If the KV cache is RoPEd, the KV data is updated accordingly
|
|
532
|
+
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
533
|
+
// - lazily on next llama_decode()
|
|
534
|
+
// - explicitly with llama_kv_cache_update()
|
|
510
535
|
// p0 < 0 : [0, p1]
|
|
511
536
|
// p1 < 0 : [p0, inf)
|
|
512
537
|
LLAMA_API void llama_kv_cache_seq_div(
|
|
@@ -516,6 +541,20 @@ extern "C" {
|
|
|
516
541
|
llama_pos p1,
|
|
517
542
|
int d);
|
|
518
543
|
|
|
544
|
+
// Returns the largest position present in the KV cache for the specified sequence
|
|
545
|
+
LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
|
|
546
|
+
struct llama_context * ctx,
|
|
547
|
+
llama_seq_id seq_id);
|
|
548
|
+
|
|
549
|
+
// Defragment the KV cache
|
|
550
|
+
// This will be applied:
|
|
551
|
+
// - lazily on next llama_decode()
|
|
552
|
+
// - explicitly with llama_kv_cache_update()
|
|
553
|
+
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
|
|
554
|
+
|
|
555
|
+
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
556
|
+
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
|
|
557
|
+
|
|
519
558
|
//
|
|
520
559
|
// State / sessions
|
|
521
560
|
//
|
|
@@ -535,7 +574,7 @@ extern "C" {
|
|
|
535
574
|
// Returns the number of bytes read
|
|
536
575
|
LLAMA_API size_t llama_set_state_data(
|
|
537
576
|
struct llama_context * ctx,
|
|
538
|
-
|
|
577
|
+
const uint8_t * src);
|
|
539
578
|
|
|
540
579
|
// Save/load session file
|
|
541
580
|
LLAMA_API bool llama_load_session_file(
|
|
@@ -555,27 +594,6 @@ extern "C" {
|
|
|
555
594
|
// Decoding
|
|
556
595
|
//
|
|
557
596
|
|
|
558
|
-
// Run the llama inference to obtain the logits and probabilities for the next token(s).
|
|
559
|
-
// tokens + n_tokens is the provided batch of new tokens to process
|
|
560
|
-
// n_past is the number of tokens to use from previous eval calls
|
|
561
|
-
// Returns 0 on success
|
|
562
|
-
// DEPRECATED: use llama_decode() instead
|
|
563
|
-
LLAMA_API DEPRECATED(int llama_eval(
|
|
564
|
-
struct llama_context * ctx,
|
|
565
|
-
llama_token * tokens,
|
|
566
|
-
int32_t n_tokens,
|
|
567
|
-
int32_t n_past),
|
|
568
|
-
"use llama_decode() instead");
|
|
569
|
-
|
|
570
|
-
// Same as llama_eval, but use float matrix input directly.
|
|
571
|
-
// DEPRECATED: use llama_decode() instead
|
|
572
|
-
LLAMA_API DEPRECATED(int llama_eval_embd(
|
|
573
|
-
struct llama_context * ctx,
|
|
574
|
-
float * embd,
|
|
575
|
-
int32_t n_tokens,
|
|
576
|
-
int32_t n_past),
|
|
577
|
-
"use llama_decode() instead");
|
|
578
|
-
|
|
579
597
|
// Return batch for single sequence of tokens starting at pos_0
|
|
580
598
|
//
|
|
581
599
|
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
|
|
@@ -689,6 +707,25 @@ extern "C" {
|
|
|
689
707
|
char * buf,
|
|
690
708
|
int32_t length);
|
|
691
709
|
|
|
710
|
+
/// Apply chat template. Inspired by Hugging Face's apply_chat_template() in Python.
|
|
711
|
+
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
|
712
|
+
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
|
713
|
+
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
|
|
714
|
+
/// @param chat Pointer to a list of multiple llama_chat_message
|
|
715
|
+
/// @param n_msg Number of llama_chat_message in this chat
|
|
716
|
+
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
|
|
717
|
+
/// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
|
|
718
|
+
/// @param length The size of the allocated buffer
|
|
719
|
+
/// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
|
|
720
|
+
LLAMA_API int32_t llama_chat_apply_template(
|
|
721
|
+
const struct llama_model * model,
|
|
722
|
+
const char * tmpl,
|
|
723
|
+
const struct llama_chat_message * chat,
|
|
724
|
+
size_t n_msg,
|
|
725
|
+
bool add_ass,
|
|
726
|
+
char * buf,
|
|
727
|
+
int32_t length);
|
|
728
|
+
|
|
692
729
|
//
|
|
693
730
|
// Grammar
|
|
694
731
|
//
|
|
@@ -730,13 +767,6 @@ extern "C" {
|
|
|
730
767
|
float * logits_guidance,
|
|
731
768
|
float scale);
|
|
732
769
|
|
|
733
|
-
LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
|
|
734
|
-
struct llama_context * ctx,
|
|
735
|
-
llama_token_data_array * candidates,
|
|
736
|
-
struct llama_context * guidance_ctx,
|
|
737
|
-
float scale),
|
|
738
|
-
"use llama_sample_apply_guidance() instead");
|
|
739
|
-
|
|
740
770
|
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
|
741
771
|
LLAMA_API void llama_sample_softmax(
|
|
742
772
|
struct llama_context * ctx,
|
|
@@ -790,12 +820,6 @@ extern "C" {
|
|
|
790
820
|
llama_token_data_array * candidates,
|
|
791
821
|
float temp);
|
|
792
822
|
|
|
793
|
-
LLAMA_API DEPRECATED(void llama_sample_temperature(
|
|
794
|
-
struct llama_context * ctx,
|
|
795
|
-
llama_token_data_array * candidates,
|
|
796
|
-
float temp),
|
|
797
|
-
"use llama_sample_temp instead");
|
|
798
|
-
|
|
799
823
|
/// @details Apply constraints from grammar
|
|
800
824
|
LLAMA_API void llama_sample_grammar(
|
|
801
825
|
struct llama_context * ctx,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
|
|
2
2
|
GF_CC_IS_GCC = 1
|
|
3
|
-
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
|
3
|
+
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
|
|
4
4
|
else
|
|
5
5
|
GF_CC_IS_CLANG = 1
|
|
6
6
|
ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
|