llama_cpp 0.13.0 → 0.14.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -59,9 +59,10 @@ extern "C" {
|
|
59
59
|
typedef int32_t llama_seq_id;
|
60
60
|
|
61
61
|
enum llama_vocab_type {
|
62
|
-
|
63
|
-
|
64
|
-
|
62
|
+
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
63
|
+
LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
|
64
|
+
LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
|
65
|
+
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
65
66
|
};
|
66
67
|
|
67
68
|
// note: these values should be synchronized with ggml_rope
|
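Note that the existing enumerators are renumbered to make room for `LLAMA_VOCAB_TYPE_NONE`, so code that persisted the raw integer values needs a rebuild. A minimal sketch of guarding on the new value through the public getter (model loading omitted; `llama_vocab_type()` is declared further down in this header):

```c
#include <stdbool.h>
#include "llama.h"

// Sketch: skip tokenization for models that ship no vocabulary at all.
// Assumes `model` came from llama_load_model_from_file().
static bool has_usable_vocab(const struct llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_NONE: return false; // no vocab: tokenization unavailable
        case LLAMA_VOCAB_TYPE_SPM:                // SentencePiece
        case LLAMA_VOCAB_TYPE_BPE:                // Byte Pair Encoding
        case LLAMA_VOCAB_TYPE_WPM:                // WordPiece
            return true;
    }
    return false;
}
```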
@@ -129,6 +130,7 @@ extern "C" {
|
|
129
130
|
};
|
130
131
|
|
131
132
|
enum llama_pooling_type {
|
133
|
+
LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
|
132
134
|
LLAMA_POOLING_TYPE_NONE = 0,
|
133
135
|
LLAMA_POOLING_TYPE_MEAN = 1,
|
134
136
|
LLAMA_POOLING_TYPE_CLS = 2,
|
@@ -162,7 +164,7 @@ extern "C" {
|
|
162
164
|
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
163
165
|
// - pos : the positions of the respective token in the sequence
|
164
166
|
// - seq_id : the sequence to which the respective token belongs
|
165
|
-
// - logits : if zero, the logits for the respective token will not be output
|
167
|
+
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
166
168
|
//
|
167
169
|
typedef struct llama_batch {
|
168
170
|
int32_t n_tokens;
|
@@ -172,7 +174,7 @@ extern "C" {
|
|
172
174
|
llama_pos * pos;
|
173
175
|
int32_t * n_seq_id;
|
174
176
|
llama_seq_id ** seq_id;
|
175
|
-
int8_t * logits;
|
177
|
+
int8_t * logits; // TODO: rename this to "output"
|
176
178
|
|
177
179
|
// NOTE: helpers for smooth API transition - can be deprecated in the future
|
178
180
|
// for future-proof code, use the above fields instead and ignore everything below
|
@@ -233,10 +235,15 @@ extern "C" {
|
|
233
235
|
struct llama_context_params {
|
234
236
|
uint32_t seed; // RNG seed, -1 for random
|
235
237
|
uint32_t n_ctx; // text context, 0 = from model
|
236
|
-
uint32_t n_batch; //
|
238
|
+
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
|
239
|
+
uint32_t n_ubatch; // physical maximum batch size
|
240
|
+
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
237
241
|
uint32_t n_threads; // number of threads to use for generation
|
238
242
|
uint32_t n_threads_batch; // number of threads to use for batch processing
|
239
|
-
|
243
|
+
|
244
|
+
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
245
|
+
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
246
|
+
// (ignored if no pooling layer)
|
240
247
|
|
241
248
|
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
242
249
|
float rope_freq_base; // RoPE base frequency, 0 = from model
|
@@ -255,10 +262,15 @@ extern "C" {
|
|
255
262
|
enum ggml_type type_v; // data type for V cache
|
256
263
|
|
257
264
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
258
|
-
bool logits_all; // the
|
259
|
-
bool
|
265
|
+
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
266
|
+
bool embeddings; // if true, extract embeddings (together with logits)
|
260
267
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
261
|
-
|
268
|
+
|
269
|
+
// Abort callback
|
270
|
+
// if it returns true, execution of llama_decode() will be aborted
|
271
|
+
// currently works only with CPU execution
|
272
|
+
ggml_abort_callback abort_callback;
|
273
|
+
void * abort_callback_data;
|
262
274
|
};
|
263
275
|
|
264
276
|
// model quantization parameters
|
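The abort callback has the `ggml_abort_callback` shape, i.e. `bool (*)(void * data)`, and aborting happens when it returns true. A deadline-based sketch; the helper name and the 30-second budget are invented for illustration:

```c
#include <stdbool.h>
#include <time.h>
#include "llama.h"

// Hypothetical helper: abort llama_decode() once a wall-clock deadline passes.
// Returning true aborts; per the header comment this currently only takes
// effect with CPU execution.
static bool deadline_reached(void * data) {
    const time_t * deadline = (const time_t *) data;
    return time(NULL) >= *deadline;
}

// Wiring it up (inside whatever function builds the context params):
//     static time_t deadline;             // must outlive the context
//     deadline = time(NULL) + 30;         // e.g. a 30 s budget
//     params.abort_callback      = deadline_reached;
//     params.abort_callback_data = &deadline;
```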
@@ -268,7 +280,7 @@ extern "C" {
|
|
268
280
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
269
281
|
bool quantize_output_tensor; // quantize output.weight
|
270
282
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
271
|
-
bool pure; //
|
283
|
+
bool pure; // quantize all tensors to the default type
|
272
284
|
void * imatrix; // pointer to importance matrix data
|
273
285
|
} llama_model_quantize_params;
|
274
286
|
|
@@ -367,6 +379,8 @@ extern "C" {
|
|
367
379
|
|
368
380
|
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
369
381
|
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
382
|
+
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
383
|
+
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
370
384
|
|
371
385
|
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
372
386
|
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
@@ -445,7 +459,7 @@ extern "C" {
|
|
445
459
|
// Maximum number of sequences that can exist in a cell. It's not an error
|
446
460
|
// if there are more sequences in a cell than this value, however they will
|
447
461
|
// not be visible in the view cells_sequences.
|
448
|
-
int32_t
|
462
|
+
int32_t n_seq_max;
|
449
463
|
|
450
464
|
// Number of tokens in the cache. For example, if there are two populated
|
451
465
|
// cells, the first with 1 sequence id in it and the second with 2 sequence
|
@@ -465,12 +479,12 @@ extern "C" {
|
|
465
479
|
// Information for an individual cell.
|
466
480
|
struct llama_kv_cache_view_cell * cells;
|
467
481
|
|
468
|
-
// The sequences for each cell. There will be
|
482
|
+
// The sequences for each cell. There will be n_seq_max items per cell.
|
469
483
|
llama_seq_id * cells_sequences;
|
470
484
|
};
|
471
485
|
|
472
486
|
// Create an empty KV cache view. (use only for debugging purposes)
|
473
|
-
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t
|
487
|
+
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
|
474
488
|
|
475
489
|
// Free a KV cache view. (use only for debugging purposes)
|
476
490
|
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
|
@@ -493,7 +507,7 @@ extern "C" {
|
|
493
507
|
// seq_id < 0 : match any sequence
|
494
508
|
// p0 < 0 : [0, p1]
|
495
509
|
// p1 < 0 : [p0, inf)
|
496
|
-
LLAMA_API
|
510
|
+
LLAMA_API bool llama_kv_cache_seq_rm(
|
497
511
|
struct llama_context * ctx,
|
498
512
|
llama_seq_id seq_id,
|
499
513
|
llama_pos p0,
|
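The visible change is the return type: removal can now fail, which (as I understand it) arises with recurrent models whose state cannot be partially rolled back. A hedged sketch of handling the new `bool`:

```c
#include "llama.h"

// Sketch: try to trim sequence 0 back to its first 10 positions; if a partial
// removal is refused, clear the whole sequence instead.
static void trim_or_clear(struct llama_context * ctx) {
    if (!llama_kv_cache_seq_rm(ctx, /*seq_id=*/0, /*p0=*/10, /*p1=*/-1)) {
        llama_kv_cache_seq_rm(ctx, 0, -1, -1); // p0/p1 < 0 covers the full range
    }
}
```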
@@ -632,7 +646,19 @@ extern "C" {
|
|
632
646
|
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
633
647
|
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
634
648
|
|
635
|
-
//
|
649
|
+
// Set whether to use causal attention or not
|
650
|
+
// If set to true, the model will only attend to the past tokens
|
651
|
+
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
652
|
+
|
653
|
+
// Set abort callback
|
654
|
+
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
|
655
|
+
|
656
|
+
// Wait until all computations are finished
|
657
|
+
// This is automatically done when using one of the functions below to obtain the computation results
|
658
|
+
// and is not necessary to call it explicitly in most cases
|
659
|
+
LLAMA_API void llama_synchronize(struct llama_context * ctx);
|
660
|
+
|
661
|
+
// Token logits obtained from the last call to llama_decode()
|
636
662
|
// The logits for the last token are stored in the last row
|
637
663
|
// Logits for which llama_batch.logits[i] == 0 are undefined
|
638
664
|
// Rows: n_tokens provided with llama_batch
|
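Together these additions make evaluation interruptible and, with pipeline parallelism, asynchronous. A sketch of where `llama_synchronize()` fits, assuming `ctx` and `batch` were set up elsewhere; the `overlap_work` callback stands in for arbitrary host-side work:

```c
#include "llama.h"

// Sketch: overlap host work with an in-flight llama_decode().
static void decode_and_overlap(struct llama_context * ctx, struct llama_batch batch,
                               void (*overlap_work)(void)) {
    if (llama_decode(ctx, batch) != 0) {
        return; // decode did not succeed
    }
    overlap_work();         // runs while the evaluation may still be in flight
    llama_synchronize(ctx); // block until results are ready
    float * logits = llama_get_logits(ctx); // getters synchronize implicitly too
    (void) logits;          // consume logits here
}
```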
@@ -643,14 +669,20 @@ extern "C" {
|
|
643
669
|
// llama_get_logits(ctx) + i*n_vocab
|
644
670
|
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
645
671
|
|
646
|
-
// Get
|
647
|
-
// shape: [n_embd] (1-dimensional)
|
672
|
+
// Get all output token embeddings
|
673
|
+
// shape: [n_tokens*n_embd] (1-dimensional)
|
648
674
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
649
675
|
|
650
|
-
// Get the embeddings for the ith
|
676
|
+
// Get the embeddings for the ith token
|
651
677
|
// llama_get_embeddings(ctx) + i*n_embd
|
678
|
+
// shape: [n_embd] (1-dimensional)
|
652
679
|
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
653
680
|
|
681
|
+
// Get the embeddings for a sequence id
|
682
|
+
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
683
|
+
// shape: [n_embd] (1-dimensional)
|
684
|
+
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
685
|
+
|
654
686
|
//
|
655
687
|
// Vocab
|
656
688
|
//
|
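A sketch of reading the new pooled per-sequence embedding, assuming embeddings were enabled (`params.embeddings = true`) and a pooling type other than `LLAMA_POOLING_TYPE_NONE`; the helper name is invented:

```c
#include "llama.h"

// Hypothetical helper: copy the pooled embedding of sequence 0 into a caller
// buffer. Returns the embedding size on success, 0 if no pooled embedding exists.
static int32_t read_seq_embedding(struct llama_context * ctx,
                                  const struct llama_model * model,
                                  float * dst /* capacity >= n_embd */) {
    const int32_t n_embd = llama_n_embd(model);
    const float * emb = llama_get_embeddings_seq(ctx, /*seq_id=*/0);
    if (emb == NULL) {
        return 0; // pooling_type was LLAMA_POOLING_TYPE_NONE
    }
    for (int32_t i = 0; i < n_embd; ++i) {
        dst[i] = emb[i];
    }
    return n_embd;
}
```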
@@ -684,7 +716,7 @@ extern "C" {
|
|
684
716
|
|
685
717
|
/// @details Convert the provided text into tokens.
|
686
718
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
687
|
-
/// @return Returns the number of tokens on success, no more than
|
719
|
+
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
688
720
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
689
721
|
/// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
|
690
722
|
/// Does not insert a leading space.
|
@@ -693,7 +725,7 @@ extern "C" {
|
|
693
725
|
const char * text,
|
694
726
|
int32_t text_len,
|
695
727
|
llama_token * tokens,
|
696
|
-
int32_t
|
728
|
+
int32_t n_tokens_max,
|
697
729
|
bool add_bos,
|
698
730
|
bool special);
|
699
731
|
|
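The rename to `n_tokens_max` is cosmetic, but the two `@return` cases above suggest the usual two-pass pattern: probe with a zero-capacity buffer, then allocate from the negated return value. A sketch under those assumptions (`tokenize_all` is an invented helper):

```c
#include <stdlib.h>
#include <string.h>
#include "llama.h"

// Hypothetical helper: tokenize `text`, sizing the buffer from the negative
// return value of a first, zero-capacity call.
static int32_t tokenize_all(const struct llama_model * model, const char * text,
                            bool add_bos, llama_token ** out) {
    const int32_t text_len = (int32_t) strlen(text);
    int32_t n = llama_tokenize(model, text, text_len, NULL, 0, add_bos, false);
    if (n < 0) {
        n = -n; // the failure path reports how many tokens would have been written
    }
    llama_token * tokens = malloc((size_t) n * sizeof(*tokens));
    if (tokens == NULL) {
        return -1;
    }
    n = llama_tokenize(model, text, text_len, tokens, n, add_bos, false);
    *out = tokens;
    return n; // number of tokens actually written
}
```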