llama_cpp 0.14.7 → 0.15.1

This diff covers the publicly released contents of the two package versions as published to their registries, and is provided for informational purposes only. All of the changes below are in the bundled ggml header: a new bfloat16 (bf16) type with conversion helpers, a row-data validation hook, and a fused flash-attention op, ggml_flash_attn_ext.
@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
 
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
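
The fp16 converters keep their behavior; only the parameter names are dropped from the prototypes. The bfloat16 additions mirror them. A minimal sketch of the new bf16 conversion API, assuming the 0.15.x ggml.h is on the include path:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // scalar round trip: fp32 -> bf16 -> fp32; bf16 keeps the top 16 bits
        // of the fp32 pattern, so roughly 7 mantissa bits survive
        ggml_bf16_t h = ggml_fp32_to_bf16(3.14159f);
        printf("bf16 round trip: %f\n", ggml_bf16_to_fp32(h));

        // bulk row conversions mirror the fp16 helpers
        float       src[4] = { 1.0f, 0.5f, -2.0f, 3.0f };
        ggml_bf16_t tmp[4];
        float       dst[4];
        ggml_fp32_to_bf16_row(src, tmp, 4);
        ggml_bf16_to_fp32_row(tmp, dst, 4);
        return 0;
    }

Note that ggml_bf16_t is a one-field struct rather than a bare uint16_t like ggml_fp16_t, so the compiler can tell the two apart; the "todo: make this not an integral type" comment suggests fp16 is meant to follow.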
@@ -370,6 +376,7 @@ extern "C" {
         GGML_TYPE_I64     = 27,
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
+        GGML_TYPE_BF16    = 30,
         GGML_TYPE_COUNT,
     };
 
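
With GGML_TYPE_BF16 in the type enum, bf16 tensors can be requested through the usual constructors. A sketch, assuming the new type is wired into ggml_new_tensor_1d like the existing ones:

    #include "ggml.h"

    // allocate a 1-d bf16 tensor of n elements in an existing context
    struct ggml_tensor * make_bf16_row(struct ggml_context * ctx, int64_t n) {
        return ggml_new_tensor_1d(ctx, GGML_TYPE_BF16, n);
    }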
@@ -410,6 +417,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S   = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
     };
 
     // available tensor operations:
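
GGML_FTYPE_MOSTLY_BF16 marks model files whose weights are stored mostly as bf16, following the pattern of the other GGML_FTYPE_MOSTLY_* values. A sketch, assuming the pre-existing ggml_ftype_to_ggml_type() helper was extended to cover the new entry:

    #include "ggml.h"

    // expected to map the new file type to its tensor type (GGML_TYPE_BF16)
    enum ggml_type bf16_file_tensor_type(void) {
        return ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_BF16);
    }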
@@ -475,6 +483,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +771,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
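
ggml_validate_row_data is a new guard that checks whether a raw buffer holds well-formed values for a given tensor type before the data is trusted, which is mainly useful when loading tensors from disk. A sketch using the existing ggml_nbytes() helper:

    #include <stdbool.h>
    #include "ggml.h"

    // false if the buffer contains malformed rows for t->type
    static bool tensor_data_ok(const struct ggml_tensor * t) {
        return ggml_validate_row_data(t->type, t->data, ggml_nbytes(t));
    }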
@@ -1720,6 +1731,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
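
ggml_flash_attn_ext is the graph-building entry point for the GGML_OP_FLASH_ATTN_EXT op added above; the shape comments spell out the expected q/k/v/mask layouts, and ggml_flash_attn_ext_set_prec can request higher-precision accumulation on the resulting node. A sketch of wiring it up, with illustrative dimensions and the conventional 1/sqrt(head_dim) scale; GGML_PREC_F32 comes from the pre-existing enum ggml_prec:

    #include <math.h>
    #include "ggml.h"

    struct ggml_tensor * fused_attn(struct ggml_context * ctx,
                                    struct ggml_tensor  * q,    // [n_embd, n_batch, n_head,    1]
                                    struct ggml_tensor  * k,    // [n_embd, n_kv,    n_head_kv, 1]
                                    struct ggml_tensor  * v,    // [n_embd, n_kv,    n_head_kv, 1], not transposed
                                    struct ggml_tensor  * mask, // [n_kv, GGML_PAD(n_batch, GGML_KQ_MASK_PAD), 1, 1]
                                    int                   n_embd_head) {
        struct ggml_tensor * out =
            ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f / sqrtf((float) n_embd_head));
        // optionally force fp32 accumulation for this node
        ggml_flash_attn_ext_set_prec(out, GGML_PREC_F32);
        return out; // [n_embd, n_head, n_batch, 1], permuted
    }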