llama_cpp 0.16.1 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.h

```diff
@@ -174,6 +174,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
     };
 
     enum llama_split_mode {
@@ -293,7 +294,6 @@ extern "C" {
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
```
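The new LLAMA_POOLING_TYPE_LAST value plugs into the existing `pooling_type` field shown above. Below is a minimal sketch of requesting last-token pooling when creating a context; the setup calls (llama_model_default_params, llama_load_model_from_file, llama_new_context_with_model) and the `embeddings` flag are pre-existing llama.cpp C API rather than part of this diff, and the model path is a placeholder.

```c
#include "llama.h"

// Sketch only: request pooled embeddings taken from the last token of each
// sequence via the new LLAMA_POOLING_TYPE_LAST value.
int main(void) {
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                     // return embeddings
    cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // new in this release

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
    // ... tokenize, llama_decode(), then read the pooled embedding per sequence ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```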
```diff
@@ -786,6 +786,10 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
 
+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
```
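Given the llama_set_embeddings declaration above, here is a hedged sketch of toggling a context between embeddings output and normal logit generation; llama_decode and llama_get_embeddings_seq are pre-existing llama.cpp API calls, and the two-pass flow is illustrative only.

```c
#include <stdbool.h>
#include "llama.h"

// Illustrative only: flip an existing context into embeddings mode for one
// pass, then back to logits, using the newly exposed llama_set_embeddings().
static void two_pass(struct llama_context * ctx, struct llama_batch batch) {
    llama_set_embeddings(ctx, true);   // embeddings are returned, logits are not
    llama_decode(ctx, batch);
    // ... read vectors via llama_get_embeddings_seq(ctx, seq_id) ...

    llama_set_embeddings(ctx, false);  // back to normal logit generation
    llama_decode(ctx, batch);
    // ... sample from logits as usual ...
}
```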
data/vendor/tmp/llama.cpp/sgemm.cpp

```diff
@@ -43,8 +43,10 @@
 // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
 // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
 
+#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wpedantic"
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
 
 #include "sgemm.h"
 #include "ggml-impl.h"
```