llama_cpp 0.14.7 → 0.15.0

@@ -475,6 +475,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +763,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
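Note (not part of the diff): ggml_validate_row_data() appears intended to catch corrupted tensor data, e.g. NaN/Inf values or malformed quantized blocks, before a loaded tensor is used. A minimal sketch of how it might be called, assuming a ggml_tensor `t` whose data buffer has already been populated; the helper name and error handling are illustrative only:

    // hypothetical helper: validate the raw data of one loaded tensor
    #include <stdio.h>
    #include "ggml.h"

    static bool check_tensor(const struct ggml_tensor * t) {
        // ggml_nbytes() returns the total byte size of the tensor's data
        if (!ggml_validate_row_data(t->type, t->data, ggml_nbytes(t))) {
            fprintf(stderr, "tensor '%s' contains invalid data\n", t->name);
            return false;
        }
        return true;
    }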
@@ -1720,6 +1723,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
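Note (not part of the diff): ggml_flash_attn_ext() fuses scaled-dot-product attention, roughly softmax(Q·Kᵀ·scale + mask)·V, into a single graph node, and ggml_flash_attn_ext_set_prec() lets the caller request F32 accumulation for that node. A rough sketch of building such a node with the shapes documented above, assuming F32 Q, F16 K/V/mask, and n_head_kv == n_head (no grouped-query attention); the tensor types a given backend actually accepts may differ:

    // hypothetical graph-building helper; dimensions are illustrative
    #include <math.h>
    #include "ggml.h"

    static struct ggml_tensor * build_flash_attn(struct ggml_context * ctx,
                                                 int64_t n_embd_head, int64_t n_head,
                                                 int64_t n_batch, int64_t n_kv) {
        struct ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_head, n_batch, n_head, 1);
        struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, n_embd_head, n_kv,    n_head, 1);
        struct ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, n_embd_head, n_kv,    n_head, 1); // not transposed

        // the mask's batch dimension is padded to a multiple of GGML_KQ_MASK_PAD
        const int64_t n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD);
        struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_kv, n_batch_pad);

        // scale is the usual 1/sqrt(head_dim) attention scaling factor
        struct ggml_tensor * kqv = ggml_flash_attn_ext(ctx, q, k, v, mask,
                                                       1.0f / sqrtf((float) n_embd_head));

        // optionally request F32 accumulation for this op
        ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32);

        return kqv; // [n_embd_head, n_head, n_batch, 1] (permuted, as noted in the header comment)
    }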