llama_cpp 0.12.7 → 0.14.0
This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.h:

```diff
@@ -64,6 +64,15 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
     };
 
+    // note: these values should be synchronized with ggml_rope
+    // TODO: maybe move this enum to ggml.h (ggml_rope_type)
+    enum llama_rope_type {
+        LLAMA_ROPE_TYPE_NONE = -1,
+        LLAMA_ROPE_TYPE_NORM =  0,
+        LLAMA_ROPE_TYPE_NEOX =  2,
+        LLAMA_ROPE_TYPE_GLM  =  4,
+    };
+
     enum llama_token_type {
         LLAMA_TOKEN_TYPE_UNDEFINED = 0,
         LLAMA_TOKEN_TYPE_NORMAL = 1,
```
```diff
@@ -98,32 +107,38 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XS  = 20, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors
-
+        LLAMA_FTYPE_MOSTLY_IQ3_XS  = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_S   = 24, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_NL  = 25, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_S   = 26, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_M   = 27, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_S   = 28, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
     enum llama_rope_scaling_type {
-
-
-
-
-
+        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
+        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
     };
 
     enum llama_pooling_type {
-
-
-
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
+        LLAMA_POOLING_TYPE_NONE = 0,
+        LLAMA_POOLING_TYPE_MEAN = 1,
+        LLAMA_POOLING_TYPE_CLS  = 2,
     };
 
     enum llama_split_mode {
-
-
-
+        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
     };
 
     typedef struct llama_token_data {
```
```diff
@@ -148,7 +163,7 @@ extern "C" {
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
     // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits for the respective token will not be output
+    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
         int32_t n_tokens;
```
```diff
@@ -158,7 +173,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits;
+        int8_t       *  logits; // TODO: rename this to "output"
 
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
```
```diff
@@ -171,9 +186,9 @@ extern "C" {
     } llama_batch;
 
     enum llama_model_kv_override_type {
-
-
-
+        LLAMA_KV_OVERRIDE_TYPE_INT,
+        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+        LLAMA_KV_OVERRIDE_TYPE_BOOL,
     };
 
     struct llama_model_kv_override {
```
```diff
@@ -222,7 +237,10 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-
+
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
```
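For context, a minimal sketch of how the two new typed fields might be filled in when creating a context. `llama_context_default_params()` and `llama_new_context_with_model()` are assumed from the same header (they are not part of this hunk), and the model is assumed to have been loaded elsewhere:

```c
#include "llama.h"

// Sketch only: exercise the new rope_scaling_type / pooling_type fields of llama_context_params.
static struct llama_context * new_embedding_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();

    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; // let the model metadata decide
    cparams.pooling_type      = LLAMA_POOLING_TYPE_MEAN;             // pool embeddings per sequence id

    return llama_new_context_with_model(model, cparams);
}
```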
```diff
@@ -232,6 +250,7 @@ extern "C" {
         float    yarn_beta_fast; // YaRN low correction dim
         float    yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx;  // YaRN original context size
+        float    defrag_thold;   // defragment the KV cache if holes/size > thold, < 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
```
```diff
@@ -240,11 +259,15 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool
-        bool
-        bool embedding;   // embedding mode only
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     // model quantization parameters
```
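The new `abort_callback`/`abort_callback_data` pair makes it possible to stop a long `llama_decode()` call from the outside. A hedged sketch, assuming `ggml_abort_callback` is the `bool (*)(void * data)` typedef from ggml.h (the ggml.h side of this change is not shown in this hunk):

```c
#include <stdbool.h>
#include "llama.h"

// Sketch only: return true from the callback to abort the current llama_decode() call
// (per the comments above, this currently only takes effect on CPU execution).
static bool my_abort_cb(void * data) {
    const bool * stop_requested = (const bool *) data;
    return *stop_requested;
}

static void install_abort_cb(struct llama_context_params * cparams, bool * stop_flag) {
    cparams->abort_callback      = my_abort_cb;
    cparams->abort_callback_data = stop_flag;
}
```

The same pair can also be changed on an existing context with `llama_set_abort_callback()`, which is added further down in this diff.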
```diff
@@ -349,15 +372,13 @@ extern "C" {
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
 
-    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
+    LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
```
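A small sketch of the accessor added here next to `llama_vocab_type()`; the other calls are context lines from the same hunk, and the model pointer is assumed to come from elsewhere in the program:

```c
#include <stdio.h>
#include "llama.h"

// Sketch only: query basic model metadata, including the new rope type accessor.
static void print_model_info(const struct llama_model * model) {
    printf("vocab type : %d\n", (int) llama_vocab_type(model));
    printf("rope type  : %d\n", (int) llama_rope_type(model)); // LLAMA_ROPE_TYPE_NONE/NORM/NEOX/GLM
    printf("n_vocab    : %d\n", llama_n_vocab(model));
    printf("n_ctx_train: %d\n", llama_n_ctx_train(model));
}
```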
```diff
@@ -407,14 +428,6 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-            struct llama_context * ctx,
-                      const char * path_lora,
-                             float scale,
-                      const char * path_base_model,
-                           int32_t n_threads),
-            "use llama_model_apply_lora_from_file instead");
-
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
                           const char * path_lora,
```
```diff
@@ -512,10 +525,12 @@ extern "C" {
            llama_seq_id   seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void
+    LLAMA_API void llama_kv_cache_seq_add(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
```
```diff
@@ -523,7 +538,9 @@ extern "C" {
                        llama_pos   delta);
 
     // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_div(
```
```diff
@@ -533,6 +550,20 @@ extern "C" {
                        llama_pos   p1,
                              int   d);
 
+    // Returns the largest position present in the KV cache for the specified sequence
+    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Defragment the KV cache
+    // This will be applied:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
+    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+
     //
     // State / sessions
     //
```
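Taken together with `llama_kv_cache_seq_add()` above, the new calls let the usual context-shift pattern be written roughly as follows. `llama_kv_cache_seq_rm()` is assumed to exist in the same header (it is not part of these hunks):

```c
#include "llama.h"

// Sketch only: drop the oldest n_discard positions of sequence 0, shift the rest left,
// then defragment and apply the pending KV cache updates.
static void shift_and_compact(struct llama_context * ctx, llama_pos n_discard) {
    const llama_seq_id seq = 0;

    if (llama_kv_cache_seq_pos_max(ctx, seq) < n_discard) {
        return; // nothing to discard yet
    }

    llama_kv_cache_seq_rm (ctx, seq, 0,         n_discard);             // assumed helper, not in this hunk
    llama_kv_cache_seq_add(ctx, seq, n_discard, -1,        -n_discard); // shift remaining positions left

    llama_kv_cache_defrag(ctx); // queue a defragmentation pass
    llama_kv_cache_update(ctx); // apply K-shift, defragmentation, etc. now instead of on next llama_decode()
}
```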
```diff
@@ -552,7 +583,7 @@ extern "C" {
     // Returns the number of bytes read
     LLAMA_API size_t llama_set_state_data(
             struct llama_context * ctx,
-
+                   const uint8_t * src);
 
     // Save/load session file
     LLAMA_API bool llama_load_session_file(
```
|
|
572
603
|
// Decoding
|
573
604
|
//
|
574
605
|
|
575
|
-
// Run the llama inference to obtain the logits and probabilities for the next token(s).
|
576
|
-
// tokens + n_tokens is the provided batch of new tokens to process
|
577
|
-
// n_past is the number of tokens to use from previous eval calls
|
578
|
-
// Returns 0 on success
|
579
|
-
// DEPRECATED: use llama_decode() instead
|
580
|
-
LLAMA_API DEPRECATED(int llama_eval(
|
581
|
-
struct llama_context * ctx,
|
582
|
-
llama_token * tokens,
|
583
|
-
int32_t n_tokens,
|
584
|
-
int32_t n_past),
|
585
|
-
"use llama_decode() instead");
|
586
|
-
|
587
|
-
// Same as llama_eval, but use float matrix input directly.
|
588
|
-
// DEPRECATED: use llama_decode() instead
|
589
|
-
LLAMA_API DEPRECATED(int llama_eval_embd(
|
590
|
-
struct llama_context * ctx,
|
591
|
-
float * embd,
|
592
|
-
int32_t n_tokens,
|
593
|
-
int32_t n_past),
|
594
|
-
"use llama_decode() instead");
|
595
|
-
|
596
606
|
// Return batch for single sequence of tokens starting at pos_0
|
597
607
|
//
|
598
608
|
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
|
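Since the deprecated `llama_eval()`/`llama_eval_embd()` wrappers are removed in this release, callers have to go through `llama_decode()`. A hedged sketch of the equivalent call, using the `llama_batch_get_one()` transition helper referenced in the context lines above (its exact signature is assumed from the same header):

```c
#include <stdio.h>
#include "llama.h"

// Sketch only: rough replacement for the removed llama_eval(ctx, tokens, n_tokens, n_past).
static int eval_tokens(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, llama_pos n_past) {
    struct llama_batch batch = llama_batch_get_one(tokens, n_tokens, n_past, 0);

    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "llama_decode() failed\n");
        return 1;
    }
    return 0;
}
```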
```diff
@@ -631,7 +641,10 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
-    //
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Rows: n_tokens provided with llama_batch
```
```diff
@@ -642,14 +655,20 @@ extern "C" {
     // llama_get_logits(ctx) + i*n_vocab
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get
-    // shape: [n_embd] (1-dimensional)
+    // Get all output token embeddings
+    // shape: [n_tokens*n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith
+    // Get the embeddings for the ith token
     // llama_get_embeddings(ctx) + i*n_embd
+    // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
     //
     // Vocab
     //
```
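A short sketch of the new per-sequence accessor. It assumes the context was created with `embeddings` enabled and a pooling type other than `LLAMA_POOLING_TYPE_NONE`, and that `llama_n_embd()` exists in the same header (it is not part of this hunk):

```c
#include <stdio.h>
#include "llama.h"

// Sketch only: read the pooled embedding of one sequence after llama_decode().
static void print_seq_embedding(struct llama_context * ctx, llama_seq_id seq) {
    const float * emb = llama_get_embeddings_seq(ctx, seq);
    if (emb == NULL) {
        fprintf(stderr, "no pooled embedding for seq %d (pooling_type == LLAMA_POOLING_TYPE_NONE?)\n", seq);
        return;
    }

    const int n_embd = llama_n_embd(llama_get_model(ctx)); // assumed accessor, not in this hunk
    for (int i = 0; i < n_embd; i++) {
        printf("%s%.6f", i == 0 ? "" : " ", emb[i]);
    }
    printf("\n");
}
```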
```diff
@@ -766,13 +785,6 @@ extern "C" {
                              float * logits_guidance,
                              float   scale);
 
-    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
-              struct llama_context * ctx,
-            llama_token_data_array * candidates,
-              struct llama_context * guidance_ctx,
-                             float   scale),
-            "use llama_sample_apply_guidance() instead");
-
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
             struct llama_context * ctx,
```
```diff
@@ -826,12 +838,6 @@ extern "C" {
            llama_token_data_array * candidates,
                             float   temp);
 
-    LLAMA_API DEPRECATED(void llama_sample_temperature(
-              struct llama_context * ctx,
-            llama_token_data_array * candidates,
-                             float   temp),
-            "use llama_sample_temp instead");
-
     /// @details Apply constraints from grammar
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
```