llama_cpp 0.14.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +3 -1
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/llama.h

@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -234,7 +235,9 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
-        uint32_t n_batch; //
+        uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch; // physical maximum batch size
+        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
        uint32_t n_threads; // number of threads to use for generation
        uint32_t n_threads_batch; // number of threads to use for batch processing
 
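Per the new comments, n_batch is the logical limit on what a single llama_decode call may submit, n_ubatch is the physical chunk size the backend actually runs, and n_seq_max bounds the number of distinct sequences in the cache. A minimal sketch of filling these fields when creating a context, assuming the pre-existing llama_context_default_params, llama_model_default_params, llama_load_model_from_file and llama_new_context_with_model helpers from the same header:

// Sketch only: fill the new llama_context_params fields before creating a context.
#include "llama.h"

static struct llama_context * make_ctx(const char * model_path) {
    struct llama_model * model =
        llama_load_model_from_file(model_path, llama_model_default_params());
    if (model == NULL) return NULL;

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 4096; // text context
    cparams.n_batch   = 512;  // logical maximum batch size per llama_decode call
    cparams.n_ubatch  = 256;  // physical maximum batch size run by the backend
    cparams.n_seq_max = 4;    // distinct sequences kept in the KV cache

    return llama_new_context_with_model(model, cparams);
}

Keeping n_ubatch at or below n_batch lets a large logical batch be processed internally in smaller physical chunks.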
@@ -277,7 +280,7 @@ extern "C" {
         bool allow_requantize; // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure; //
+        bool pure; // quantize all tensors to the default type
         void * imatrix; // pointer to importance matrix data
     } llama_model_quantize_params;
 
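As a hedged illustration of the clarified pure flag, assuming the existing llama_model_quantize_default_params() and llama_model_quantize() entry points from the same header:

#include <stdbool.h>
#include "llama.h"

// Sketch: quantize with `pure` set, forcing every tensor to the default
// (target) type instead of the usual per-tensor mixture.
static int quantize_pure(const char * fname_inp, const char * fname_out) {
    struct llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_0; // the default type to apply
    qparams.pure  = true;                    // quantize all tensors to that type
    return (int) llama_model_quantize(fname_inp, fname_out, &qparams);
}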
@@ -376,6 +379,8 @@ extern "C" {
 
     LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
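A small sketch of the new getters, assuming ctx is an existing, valid context:

#include <stdio.h>
#include "llama.h"

// Sketch: read the effective limits back from a context.
static void print_ctx_limits(const struct llama_context * ctx) {
    printf("n_ctx=%u n_batch=%u n_ubatch=%u n_seq_max=%u\n",
           llama_n_ctx(ctx), llama_n_batch(ctx),
           llama_n_ubatch(ctx), llama_n_seq_max(ctx));
}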
@@ -454,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t
+        int32_t n_seq_max;
 
         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -474,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;
 
-        // The sequences for each cell. There will be
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };
 
     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 
     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
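A debugging sketch of the renamed n_seq_max parameter, assuming llama_kv_cache_view_update() and the n_cells/n_tokens fields provided by the rest of this struct:

#include <stdio.h>
#include "llama.h"

// Debugging sketch: build a view sized for up to 4 sequences per cell,
// refresh it from the current cache state, then release it.
static void dump_kv_usage(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
    llama_kv_cache_view_update(ctx, &view); // assumed companion call that fills the view
    printf("cells=%d tokens=%d\n", view.n_cells, view.n_tokens);
    llama_kv_cache_view_free(&view);
}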
@@ -502,7 +507,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API
+    LLAMA_API bool llama_kv_cache_seq_rm(
            struct llama_context * ctx,
            llama_seq_id seq_id,
            llama_pos p0,
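Since llama_kv_cache_seq_rm now returns a bool, callers can check whether the requested range was actually cleared. A minimal sketch, assuming ctx is a valid context:

#include <stdio.h>
#include "llama.h"

// Sketch: remove positions [p0, inf) of sequence 0 and check the new return value.
static void drop_tail(struct llama_context * ctx, llama_pos p0) {
    if (!llama_kv_cache_seq_rm(ctx, /*seq_id=*/0, p0, /*p1=*/-1)) {
        fprintf(stderr, "llama_kv_cache_seq_rm: range could not be removed\n");
    }
}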
@@ -641,9 +646,18 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
     // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
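A short sketch of the two new calls, assuming ctx and batch are prepared elsewhere; per the comments above, the explicit llama_synchronize is optional because the result-fetching functions synchronize implicitly:

#include <stdbool.h>
#include "llama.h"

// Sketch: non-causal pass followed by an explicit wait for the results.
static void noncausal_pass(struct llama_context * ctx, struct llama_batch batch) {
    llama_set_causal_attn(ctx, false); // attend to all tokens in the batch
    if (llama_decode(ctx, batch) != 0) return;
    llama_synchronize(ctx);            // block until the backend has finished
    llama_set_causal_attn(ctx, true);  // restore causal masking afterwards
}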
@@ -702,7 +716,7 @@ extern "C" {
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
@@ -711,7 +725,7 @@ extern "C" {
            const char * text,
            int32_t text_len,
            llama_token * tokens,
-            int32_t
+            int32_t n_tokens_max,
            bool add_bos,
            bool special);
 
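A sketch of the documented contract around n_tokens_max, assuming the leading parameter of llama_tokenize is the model handle as in the surrounding header: on a negative return the buffer was too small, and its magnitude is the count needed for a retry.

#include <stdbool.h>
#include <stdlib.h>
#include "llama.h"

// Sketch: size the token buffer with n_tokens_max and grow it on a negative return.
static llama_token * tokenize_all(const struct llama_model * model,
                                  const char * text, int32_t text_len,
                                  int32_t * n_out) {
    int32_t n_tokens_max = 64;
    llama_token * tokens = malloc(n_tokens_max * sizeof(llama_token));
    int32_t n = llama_tokenize(model, text, text_len, tokens, n_tokens_max,
                               /*add_bos=*/true, /*special=*/false);
    if (n < 0) {                 // buffer too small: -n is the required count
        n_tokens_max = -n;
        tokens = realloc(tokens, n_tokens_max * sizeof(llama_token));
        n = llama_tokenize(model, text, text_len, tokens, n_tokens_max, true, false);
    }
    *n_out = n;
    return tokens;
}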