llama_cpp 0.14.7 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -475,6 +475,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +763,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
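The newly exported `ggml_validate_row_data()` lets callers sanity-check a buffer of (possibly quantized) row data before copying it into a tensor. The following is a minimal, hypothetical sketch of how it could be used; the `load_checked` helper and its error handling are illustrative and not part of the gem or of llama.cpp:

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Reject a data buffer that does not validate for the given tensor type
// (the bool return in the new declaration implies false == invalid).
static bool load_checked(enum ggml_type type, const void * data, size_t nbytes) {
    if (!ggml_validate_row_data(type, data, nbytes)) {
        fprintf(stderr, "tensor data failed validation\n");
        return false;
    }
    // ... proceed to copy the bytes into the destination tensor as usual ...
    return true;
}
```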
@@ -1720,6 +1723,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor  * a,
+            enum ggml_prec        prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,