llama_cpp 0.14.4 → 0.14.5
This diff shows the changes between the two package versions as published to the public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +11 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +7 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +155 -155
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +878 -216
- data/vendor/tmp/llama.cpp/ggml.c +8 -8
- data/vendor/tmp/llama.cpp/ggml.h +7 -7
- data/vendor/tmp/llama.cpp/llama.cpp +686 -124
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.h CHANGED
@@ -37,10 +37,14 @@
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 5
 
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -523,6 +527,7 @@ extern "C" {
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
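The added comment documents that removal can now fail for partial ranges (e.g. for recurrent-state models), so callers should check the result. A minimal sketch of defensive use, assuming the `llama_kv_cache_seq_rm` declaration this comment sits above; the fallback policy is illustrative, not prescribed by the header:

```c
#include <stdbool.h>
#include "llama.h"

// Drop positions [p0, p1) from sequence 0; per the new comment, partial
// removal may fail, while whole-sequence removal never does.
static void drop_range_or_clear(struct llama_context * ctx, llama_pos p0, llama_pos p1) {
    if (!llama_kv_cache_seq_rm(ctx, 0, p0, p1)) {
        llama_kv_cache_seq_rm(ctx, 0, -1, -1); // p0 < 0, p1 < 0: whole range
    }
}
```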
@@ -594,34 +599,92 @@ extern "C" {
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
                          uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
             struct llama_context * ctx,
                    const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
             struct llama_context * ctx,
                       const char * path_session,
                      llama_token * tokens_out,
                           size_t   n_token_capacity,
                           size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out),
+        "use llama_state_load_file instead");
 
-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
             struct llama_context * ctx,
                       const char * path_session,
                const llama_token * tokens,
                           size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);
 
     //
     // Decoding
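The renamed functions form a size/get/set triple for whole-context snapshots. A minimal sketch of the round trip, using only the declarations above; `save_state`/`restore_state` are hypothetical helper names and error handling is mostly elided:

```c
#include <stdlib.h>
#include "llama.h"

// Snapshot the full context state (rng, logits, embedding, kv_cache)
// into a heap buffer; caller frees. Returns NULL on allocation failure.
static uint8_t * save_state(struct llama_context * ctx, size_t * n_used) {
    const size_t n_max = llama_state_get_size(ctx); // upper bound on snapshot size
    uint8_t * buf = malloc(n_max);
    if (buf) {
        *n_used = llama_state_get_data(ctx, buf);   // bytes actually written
    }
    return buf;
}

// Restore a previously captured snapshot; returns the number of bytes read.
static size_t restore_state(struct llama_context * ctx, const uint8_t * buf) {
    return llama_state_set_data(ctx, buf);
}
```

The new `llama_state_seq_*` family follows the same size/get/set pattern scoped to a single sequence, and its save/load file functions pair with the `LLAMA_STATE_SEQ_MAGIC`/`LLAMA_STATE_SEQ_VERSION` constants added in the first hunk.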
@@ -684,8 +747,9 @@ extern "C" {
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
+    // Logits for the ith token. For positive indices, Equivalent to:
     // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
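A one-line illustration of the new negative indexing, assuming a context `ctx` that has already been decoded; previously the caller had to track the batch index of the last output:

```c
// Logits of the most recent token, without knowing its batch index:
const float * last_logits = llama_get_logits_ith(ctx, -1);
```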
@@ -697,8 +761,9 @@ extern "C" {
     // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token. Equivalent to:
+    // Get the embeddings for the ith token. For positive indices, Equivalent to:
     // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@@ -721,6 +786,8 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
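The new getters are relevant for encoder-style models. A tiny sketch, assuming a loaded `model`; the [CLS]/[SEP] framing shown in the comment is the usual BERT convention, not something the header mandates:

```c
const llama_token cls = llama_token_cls(model); // e.g. [CLS] in BERT vocabularies
const llama_token sep = llama_token_sep(model); // e.g. [SEP]
// An encoder classification input is then laid out as: cls, tok_0 ... tok_n, sep
```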
@@ -743,16 +810,16 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as
-    ///                plaintext. Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                          int32_t   text_len,
                      llama_token * tokens,
                          int32_t   n_tokens_max,
-                            bool   add_bos,
-                            bool   special);
+                            bool   add_special,
+                            bool   parse_special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
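A sketch of the renamed parameters in use; `tokenize_with_specials` is a hypothetical wrapper and the capacity handling is illustrative:

```c
#include <string.h>
#include "llama.h"

// Tokenize `text`, letting special/control tokens embedded in the string
// (e.g. "<|...|>"-style markers) map to their ids instead of being split
// as plain text. A negative return means the buffer was too small.
static int32_t tokenize_with_specials(const struct llama_model * model,
                                      const char * text,
                                      llama_token * toks, int32_t cap) {
    return llama_tokenize(model, text, (int32_t) strlen(text), toks, cap,
                          /*add_special=*/ true,    // insert BOS/EOS as the model expects
                          /*parse_special=*/ true); // expose special/control tokens
}
```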
@@ -1030,10 +1097,11 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
         struct llama_context * ctx
 );
 
-std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+void llama_grammar_accept(
     const std::vector<std::vector<llama_grammar_element>> & rules,
     const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-    const uint32_t chr);
+    const uint32_t chr,
+    std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
 
 std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     const std::string & src,
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.14.4
+  version: 0.14.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-04-
+date: 2024-04-13 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: