llama_cpp 0.13.0 → 0.14.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
```diff
@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-
-
-
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
```
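This llama.h hunk adds LLAMA_VOCAB_TYPE_NONE and renumbers the existing members, so callers that compare the result of llama_vocab_type() (declared later in this diff) against hard-coded integers need updating. A minimal sketch, assuming a model already loaded with llama_load_model_from_file():

```c
// Sketch: dispatch on the tokenizer family of an already-loaded model.
#include <stdio.h>
#include "llama.h"

static void print_vocab_kind(const struct llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_NONE: printf("no vocab\n");            break;
        case LLAMA_VOCAB_TYPE_SPM:  printf("SentencePiece\n");       break;
        case LLAMA_VOCAB_TYPE_BPE:  printf("Byte Pair Encoding\n");  break;
        case LLAMA_VOCAB_TYPE_WPM:  printf("WordPiece\n");           break;
        default:                    printf("unknown\n");             break;
    }
}
```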
```diff
@@ -129,6 +130,7 @@ extern "C" {
     };
 
     enum llama_pooling_type {
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
        LLAMA_POOLING_TYPE_NONE = 0,
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
```
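LLAMA_POOLING_TYPE_UNSPECIFIED lets the caller defer the pooling decision to the model; it pairs with the pooling_type field added to llama_context_params further down. A small sketch, assuming llama_context_default_params() from the same header:

```c
// Sketch: leave pooling selection to the model by passing the new
// "unspecified" value; whether the library already defaults to this
// value is not stated in the diff, so we set it explicitly.
#include "llama.h"

struct llama_context_params make_params_with_default_pooling(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pick based on model
    return cparams;
}
```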
```diff
@@ -162,7 +164,7 @@ extern "C" {
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
     // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits for the respective token will not be output
+    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -172,7 +174,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits;
+        int8_t       *  logits; // TODO: rename this to "output"
 
        // NOTE: helpers for smooth API transition - can be deprecated in the future
        // for future-proof code, use the above fields instead and ignore everything below
```
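Since logits now also gates embedding output (and is slated to be renamed to "output"), the usual pattern is to request output only for the final prompt token. A sketch assuming the llama_batch_init()/llama_batch_free() helpers from the same header; tokens and n_tokens stand in for an already-tokenized prompt:

```c
// Sketch: build a single-sequence batch that only requests output for the
// last token of the prompt.
#include "llama.h"

static struct llama_batch make_prompt_batch(const llama_token * tokens, int32_t n_tokens) {
    struct llama_batch batch = llama_batch_init(n_tokens, /*embd*/ 0, /*n_seq_max*/ 1);
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = 0;      // no output for prompt tokens...
    }
    batch.logits[n_tokens - 1] = 1;    // ...except the last one
    batch.n_tokens = n_tokens;
    return batch;                      // caller releases it with llama_batch_free()
}
```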
```diff
@@ -233,10 +235,15 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed;            // RNG seed, -1 for random
         uint32_t n_ctx;           // text context, 0 = from model
-        uint32_t n_batch;         //
+        uint32_t n_batch;         // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;        // physical maximum batch size
+        uint32_t n_seq_max;       // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-
+
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
```
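n_batch is now the logical per-llama_decode() limit, n_ubatch bounds the physical micro-batch, and n_seq_max caps the number of distinct sequences. A sketch of wiring these up, assuming llama_context_default_params() and llama_new_context_with_model() from the same header; the sizes are illustrative, not library defaults:

```c
// Sketch: create a context with explicit logical/physical batch limits.
#include <stddef.h>
#include "llama.h"

static struct llama_context * open_context(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx        = 4096;  // context window
    cparams.n_batch      = 2048;  // logical max tokens per llama_decode() call
    cparams.n_ubatch     = 512;   // physical max tokens per internal micro-batch
    cparams.n_seq_max    = 4;     // distinct sequence ids we intend to use
    cparams.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;
    return llama_new_context_with_model(model, cparams);
}
```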
```diff
@@ -255,10 +262,15 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all;  // the
-        bool
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     // model quantization parameters
```
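The embeddings flag replaces the deprecated logits_all route for extraction, and the abort callback (a ggml_abort_callback, i.e. bool (*)(void *) from ggml.h) can cancel a running llama_decode() on the CPU backend. A hedged sketch; g_stop and should_abort are placeholders:

```c
// Sketch: enable embedding extraction and install an abort callback that can
// cancel a long llama_decode() (CPU execution only, per the comment above).
#include <stdbool.h>
#include <stddef.h>
#include "llama.h"

static volatile bool g_stop = false;

static bool should_abort(void * data) {
    (void) data;
    return g_stop;   // returning true aborts the current llama_decode()
}

static struct llama_context_params embedding_params(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.embeddings          = true;          // extract embeddings alongside logits
    cparams.abort_callback      = should_abort;
    cparams.abort_callback_data = NULL;
    return cparams;
}
```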
```diff
@@ -268,7 +280,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   //
+        bool pure;                   // quantize all tensors to the default type
         void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;
 
```
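The pure flag now has a documented meaning: quantize every tensor to the chosen default type. A sketch assuming llama_model_quantize_default_params() and llama_model_quantize() from the same header; the file names and ftype value are placeholders:

```c
// Sketch: quantize a model file, forcing all tensors to the default type.
#include <stdio.h>
#include "llama.h"

static int quantize_pure(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target type (example value)
    qparams.pure  = true;                      // quantize all tensors to the default type
    if (llama_model_quantize(fname_inp, fname_out, &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```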
```diff
@@ -367,6 +379,8 @@ extern "C" {
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
```
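The two new getters expose the per-context limits introduced above. A trivial sketch:

```c
// Sketch: query the per-context limits after the context has been created.
#include <stdio.h>
#include "llama.h"

static void print_context_limits(const struct llama_context * ctx) {
    printf("n_ctx     = %u\n", llama_n_ctx(ctx));
    printf("n_batch   = %u\n", llama_n_batch(ctx));
    printf("n_ubatch  = %u\n", llama_n_ubatch(ctx));   // new in this release
    printf("n_seq_max = %u\n", llama_n_seq_max(ctx));  // new in this release
}
```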
```diff
@@ -445,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t
+        int32_t n_seq_max;
 
         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -465,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;
 
-        // The sequences for each cell. There will be
+        // The sequences for each cell. There will be n_seq_max items per cell.
        llama_seq_id * cells_sequences;
     };
 
     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 
     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
```
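The view field and the llama_kv_cache_view_init() parameter are both named n_seq_max now. A debugging sketch, assuming llama_kv_cache_view_update() and the n_cells/token_count/pos members of the view from the same header; the value 4 is arbitrary:

```c
// Sketch: debug dump of KV-cache occupancy using the renamed n_seq_max field.
#include <stdio.h>
#include "llama.h"

static void dump_kv_cache(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max*/ 4);
    llama_kv_cache_view_update(ctx, &view);   // populate the view

    printf("cells: %d, tokens: %d\n", view.n_cells, view.token_count);
    for (int32_t i = 0; i < view.n_cells; ++i) {
        // cells_sequences holds n_seq_max entries per cell (see comment above)
        const llama_seq_id first = view.cells_sequences[i * view.n_seq_max];
        printf("cell %d: pos=%d first_seq=%d\n", i, view.cells[i].pos, first);
    }
    llama_kv_cache_view_free(&view);
}
```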
```diff
@@ -493,7 +507,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
-    LLAMA_API
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
```
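llama_kv_cache_seq_rm() now returns a bool, so removals that the cache cannot honor can be detected. A sketch:

```c
// Sketch: drop the tail of one sequence and report when the cache refuses.
#include <stdio.h>
#include "llama.h"

static void forget_sequence_tail(struct llama_context * ctx, llama_seq_id seq, llama_pos p0) {
    // remove [p0, inf) for this sequence; p1 < 0 means "to the end" (see comment above)
    if (!llama_kv_cache_seq_rm(ctx, seq, p0, -1)) {
        fprintf(stderr, "could not remove cells for seq %d\n", seq);
    }
}
```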
```diff
@@ -632,7 +646,19 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
-    //
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
+    // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Rows: n_tokens provided with llama_batch
```
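Three new context-level calls appear here: llama_set_causal_attn(), llama_set_abort_callback(), and llama_synchronize(). A sketch of how they might be combined for a non-causal (embedding-style) pass; never_abort is a placeholder callback and the llama_decode() call is elided:

```c
// Sketch: disable causal attention, (re)install an abort callback after the
// context exists, and wait for in-flight work before reading results.
#include <stdbool.h>
#include <stddef.h>
#include "llama.h"

static bool never_abort(void * data) { (void) data; return false; }

static void configure_for_embeddings(struct llama_context * ctx) {
    llama_set_causal_attn(ctx, false);                  // bidirectional attention
    llama_set_abort_callback(ctx, never_abort, NULL);   // overrides the context-params callback
    // ... submit work with llama_decode(ctx, batch) ...
    llama_synchronize(ctx);                             // block until results are ready
}
```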
```diff
@@ -643,14 +669,20 @@ extern "C" {
     // llama_get_logits(ctx) + i*n_vocab
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get
-    // shape: [n_embd] (1-dimensional)
+    // Get all output token embeddings
+    // shape: [n_tokens*n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith
+    // Get the embeddings for the ith token
     // llama_get_embeddings(ctx) + i*n_embd
+    // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
     //
     // Vocab
     //
```
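llama_get_embeddings_seq() returns a pooled per-sequence embedding, or NULL when pooling_type is NONE, in which case the per-token getter still applies. A sketch assuming llama_get_model() and llama_n_embd() from the same header, with embeddings enabled on the context and last_i as the index of a token that requested output:

```c
// Sketch: prefer the pooled sequence embedding, fall back to the ith-token one.
#include <stdio.h>
#include "llama.h"

static void print_seq_embedding(struct llama_context * ctx, llama_seq_id seq, int32_t last_i) {
    const int32_t n_embd = llama_n_embd(llama_get_model(ctx));

    const float * emb = llama_get_embeddings_seq(ctx, seq);  // NULL if pooling is NONE
    if (emb == NULL) {
        emb = llama_get_embeddings_ith(ctx, last_i);          // per-token embedding instead
    }
    for (int32_t j = 0; j < n_embd && j < 8; ++j) {           // print a short prefix
        printf("%f ", emb[j]);
    }
    printf("\n");
}
```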
```diff
@@ -684,7 +716,7 @@ extern "C" {
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
@@ -693,7 +725,7 @@ extern "C" {
               const char * text,
                   int32_t   text_len,
              llama_token * tokens,
-                  int32_t
+                  int32_t   n_tokens_max,
                      bool   add_bos,
                      bool   special);
 
```
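The renamed n_tokens_max parameter, together with the documented negative return value, supports the usual two-pass tokenization pattern. A sketch assuming the model-taking form of llama_tokenize() declared in this header (its first parameter is not shown in the hunk above):

```c
// Sketch: size-probe pass, then tokenize into an exactly-sized buffer.
#include <stdlib.h>
#include <string.h>
#include "llama.h"

static llama_token * tokenize_text(const struct llama_model * model, const char * text, int32_t * n_out) {
    const int32_t text_len = (int32_t) strlen(text);

    // First pass with an empty buffer: a negative result is the required
    // token count, negated (see the @return comment above).
    int32_t n = llama_tokenize(model, text, text_len, NULL, 0, /*add_bos*/ true, /*special*/ false);
    if (n < 0) {
        n = -n;
    }

    llama_token * tokens = malloc((size_t) n * sizeof(llama_token));
    n = llama_tokenize(model, text, text_len, tokens, n, true, false);
    if (n < 0) { free(tokens); return NULL; }

    *n_out = n;
    return tokens;   // caller frees
}
```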