llama_cpp 0.14.6 → 0.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +11 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -293
- data/vendor/tmp/llama.cpp/ggml.c +3 -17
- data/vendor/tmp/llama.cpp/llama.cpp +379 -66
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +404 -553
- metadata +2 -2
The hunks below are from data/vendor/tmp/llama.cpp/llama.h (+19 -6):

@@ -288,6 +288,7 @@ extern "C" {
     bool quantize_output_tensor; // quantize output.weight
     bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     bool pure; // quantize all tensors to the default type
+    bool keep_split; // quantize to the same number of shards
     void * imatrix; // pointer to importance matrix data
     void * kv_overrides; // pointer to vector containing overrides
 } llama_model_quantize_params;
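As a minimal sketch (not part of this diff), the new keep_split flag is set on the quantize params before calling llama_model_quantize; the file names below are illustrative and the input is assumed to be a multi-shard GGUF:

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
        params.keep_split = true;                      // write the output with the same number of shards as the input

        // hypothetical paths: quantizes a sharded model while preserving its split layout
        const uint32_t rc = llama_model_quantize("model-00001-of-00003.gguf", "model-q4_k_m.gguf", &params);
        return rc == 0 ? 0 : 1;
    }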
@@ -390,8 +391,10 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);

-    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);

     LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
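A short usage sketch for the reworked getters, assuming a model and context created elsewhere; llama_pooling_type reports the context's pooling mode, while the vocab and RoPE getters now sit next to it:

    #include <cstdio>
    #include "llama.h"

    // Report the newly exposed type information for an already-created model/context.
    static void report_types(const llama_model * model, const llama_context * ctx) {
        if (llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_MEAN) {
            printf("embeddings from this context are mean-pooled per sequence\n");
        }
        printf("vocab type: %d, rope type: %d\n",
               (int) llama_vocab_type(model), (int) llama_rope_type(model));
    }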
@@ -783,6 +786,9 @@ extern "C" {

     LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

+    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
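A sketch of how a generation loop's stop check changes with the new helper; the function below is hypothetical glue, not gem code:

    #include "llama.h"

    // Stop condition for a sampling loop.
    // Previously callers typically compared the sampled token against llama_token_eos(model) only.
    static bool should_stop(const llama_model * model, llama_token sampled) {
        return llama_token_is_eog(model, sampled); // true for EOS, EOT and any other end-of-generation token
    }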
@@ -796,7 +802,7 @@ extern "C" {
     // Returns -1 if unknown, 1 for true or 0 for false.
     LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);

-    //
+    // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
     LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
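For context, these infill tokens are used to assemble fill-in-the-middle prompts. The sketch below only shows the special-token skeleton in the usual prefix/suffix/middle ordering; tokenizing the surrounding code text is omitted and the helper is hypothetical:

    #include <vector>
    #include "llama.h"

    // Special-token skeleton of a CodeLlama fill-in-the-middle prompt.
    static std::vector<llama_token> infill_skeleton(const llama_model * model) {
        std::vector<llama_token> out;
        out.push_back(llama_token_prefix(model)); // followed by the tokens of the code before the cursor
        out.push_back(llama_token_suffix(model)); // followed by the tokens of the code after the cursor
        out.push_back(llama_token_middle(model)); // the model generates the missing middle after this
        return out;
    }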
@@ -825,11 +831,13 @@ extern "C" {
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
         const struct llama_model * model,
         llama_token token,
         char * buf,
-        int32_t length);
+        int32_t length,
+        bool special);

     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
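A small wrapper sketch showing the extra argument; the buffer size and the retry policy in the comment are illustrative assumptions:

    #include <string>
    #include "llama.h"

    // Convert one token to text. `special` controls whether special tokens
    // (BOS/EOS, control tokens, ...) are rendered into the output.
    static std::string token_to_text(const llama_model * model, llama_token tok, bool special) {
        char buf[128];
        const int32_t n = llama_token_to_piece(model, tok, buf, (int32_t) sizeof(buf), special);
        if (n < 0) {
            return ""; // buffer too small; a real caller would retry with a larger buffer
        }
        return std::string(buf, n);
    }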
@@ -982,7 +990,7 @@ extern "C" {
         struct llama_context * ctx,
         llama_token_data_array * candidates);

-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
     LLAMA_API llama_token llama_sample_token(
         struct llama_context * ctx,
         llama_token_data_array * candidates);
@@ -1069,8 +1077,9 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL

-#include <vector>
+#include <random>
 #include <string>
+#include <vector>

 struct ggml_tensor;

@@ -1107,6 +1116,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     const std::string & src,
     llama_partial_utf8 partial_start);

+// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
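A sketch of how a test or benchmark might use this internal helper so that each sequence samples from its own RNG instead of the context-wide one; it assumes llama.cpp is built with LLAMA_API_INTERNAL defined, and the wrapper function is hypothetical:

    #define LLAMA_API_INTERNAL
    #include "llama.h"

    #include <random>

    // Per-sequence sampling with caller-owned RNG state; behaves like
    // llama_sample_token() but avoids races on the context RNG.
    static llama_token sample_for_seq(llama_context * ctx,
                                      llama_token_data_array * candidates,
                                      std::mt19937 & seq_rng) {
        return llama_sample_token_with_rng(ctx, candidates, seq_rng);
    }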