llama_cpp 0.14.7 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -475,6 +475,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +763,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
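The newly exported `ggml_validate_row_data()` lets callers sanity-check a buffer of (possibly quantized) row data before copying it into a tensor. The following is a minimal, hypothetical sketch of how it could be used; the `load_checked` helper and its error handling are illustrative and not part of the gem or of llama.cpp:

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Reject a data buffer that does not validate for the given tensor type
// (the bool return in the new declaration implies false == invalid).
static bool load_checked(enum ggml_type type, const void * data, size_t nbytes) {
    if (!ggml_validate_row_data(type, data, nbytes)) {
        fprintf(stderr, "tensor data failed validation\n");
        return false;
    }
    // ... proceed to copy the bytes into the destination tensor as usual ...
    return true;
}
```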
@@ -1720,6 +1723,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor  * a,
+            enum ggml_prec        prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,