llama_cpp 0.15.1 → 0.15.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
data/vendor/tmp/llama.cpp/llama.h (+13 -3):

```diff
@@ -81,9 +81,10 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
         LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
         LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 10,
-        LLAMA_VOCAB_PRE_TYPE_OLMO           = 11,
-        LLAMA_VOCAB_PRE_TYPE_DBRX           = 12,
+        LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
+        LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
     };
 
     // note: these values should be synchronized with ggml_rope
```
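This hunk renumbers the pre-tokenizer enum (for example, `LLAMA_VOCAB_PRE_TYPE_DBRX` moves from 12 to 13 to make room for `STABLELM2`), so downstream code should match on the named constants rather than raw integers. A minimal C sketch, assuming the `enum llama_vocab_pre_type` name from llama.h; `pre_type_name` is a hypothetical helper, not part of the library:

```c
#include "llama.h"

// Hypothetical helper: map the (renumbered) pre-tokenizer enum to a label.
// Using the named constants keeps this correct across the renumbering.
static const char * pre_type_name(enum llama_vocab_pre_type t) {
    switch (t) {
        case LLAMA_VOCAB_PRE_TYPE_STABLELM2: return "stablelm2";
        case LLAMA_VOCAB_PRE_TYPE_QWEN2:     return "qwen2";
        case LLAMA_VOCAB_PRE_TYPE_OLMO:      return "olmo";
        case LLAMA_VOCAB_PRE_TYPE_DBRX:      return "dbrx";
        default:                             return "other";
    }
}
```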
```diff
@@ -242,6 +243,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
```
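The new `rpc_servers` field pairs with the RPC backend added in ggml-rpc.cpp/ggml-rpc.h above: model layers can be offloaded to remote RPC server processes instead of (or alongside) local GPUs. A minimal C sketch, assuming a build with the RPC backend enabled; the endpoints and model path are placeholders:

```c
#include "llama.h"

int main(void) {
    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    // comma separated host:port endpoints of running RPC servers (placeholders)
    mparams.rpc_servers = "192.168.0.2:50052,192.168.0.3:50052";
    mparams.n_gpu_layers = 99; // offload layers across the RPC devices

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```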
```diff
@@ -755,6 +759,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple tokens).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
```