llama_cpp 0.14.7 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -475,6 +475,7 @@ extern "C" {
475
475
  GGML_OP_LEAKY_RELU,
476
476
 
477
477
  GGML_OP_FLASH_ATTN,
478
+ GGML_OP_FLASH_ATTN_EXT,
478
479
  GGML_OP_FLASH_FF,
479
480
  GGML_OP_FLASH_ATTN_BACK,
480
481
  GGML_OP_SSM_CONV,
@@ -762,6 +763,8 @@ extern "C" {
762
763
  // use this to compute the memory overhead of a tensor
763
764
  GGML_API size_t ggml_tensor_overhead(void);
764
765
 
766
+ GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
767
+
765
768
  // main
766
769
 
767
770
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -1720,6 +1723,25 @@ extern "C" {
1720
1723
  struct ggml_tensor * v,
1721
1724
  bool masked);
1722
1725
 
1726
+ #define GGML_KQ_MASK_PAD 32
1727
+
1728
+ // q: [n_embd, n_batch, n_head, 1]
1729
+ // k: [n_embd, n_kv, n_head_kv, 1]
1730
+ // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
1731
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1732
+ // res: [n_embd, n_head, n_batch, 1] !! permuted !!
1733
+ GGML_API struct ggml_tensor * ggml_flash_attn_ext(
1734
+ struct ggml_context * ctx,
1735
+ struct ggml_tensor * q,
1736
+ struct ggml_tensor * k,
1737
+ struct ggml_tensor * v,
1738
+ struct ggml_tensor * mask,
1739
+ float scale);
1740
+
1741
+ GGML_API void ggml_flash_attn_ext_set_prec(
1742
+ struct ggml_tensor * a,
1743
+ enum ggml_prec prec);
1744
+
1723
1745
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
1724
1746
  struct ggml_context * ctx,
1725
1747
  struct ggml_tensor * q,