llama_cpp 0.14.7 → 0.15.0
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -475,6 +475,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +763,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
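The new `ggml_validate_row_data` entry point makes it possible to sanity-check raw tensor data before it is used. Below is a minimal sketch of how it might be called; the `check_tensor` helper and its error message are illustrative, not part of the ggml or llama_cpp API.

```c
#include <stdio.h>
#include "ggml.h"

// Hypothetical helper: returns false if the tensor's host-resident data
// fails validation (e.g. NaN/Inf floats or malformed quantized blocks).
static bool check_tensor(const struct ggml_tensor * t) {
    if (!ggml_validate_row_data(t->type, t->data, ggml_nbytes(t))) {
        fprintf(stderr, "tensor '%s' contains invalid data\n", t->name);
        return false;
    }
    return true;
}
```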
@@ -1720,6 +1723,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,