llama_cpp 0.14.7 → 0.15.1

@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
 
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
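
A minimal usage sketch for the new bfloat16 helpers, based only on the declarations above (the values and buffer sizes are illustrative). bfloat16 keeps the top 16 bits of an IEEE fp32 value, so roughly 8 bits of mantissa survive the round trip:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // scalar round-trip: fp32 -> bf16 -> fp32
        float x = 3.14159f;
        ggml_bf16_t b = ggml_fp32_to_bf16(x);
        float y = ggml_bf16_to_fp32(b);
        printf("%f -> 0x%04x -> %f\n", x, (unsigned) b.bits, y);

        // row-wise conversion of a small buffer
        float src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        ggml_bf16_t tmp[4];
        float dst[4];
        ggml_fp32_to_bf16_row(src, tmp, 4);
        ggml_bf16_to_fp32_row(tmp, dst, 4);
        return 0;
    }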
@@ -370,6 +376,7 @@ extern "C" {
         GGML_TYPE_I64     = 27,
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
+        GGML_TYPE_BF16    = 30,
         GGML_TYPE_COUNT,
     };
 
@@ -410,6 +417,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -475,6 +483,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +771,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
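
ggml_validate_row_data checks a raw data buffer against a tensor type; a plausible use is rejecting NaN/Inf values before accepting loaded or quantized weights. A hedged sketch (the corrupted value is contrived, and the exact validation rules per type are an assumption here):

    #include <math.h>
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        float buf[8] = { 0 };
        buf[3] = NAN; // contrived corruption

        // expected to return false when the buffer holds values
        // that are invalid for the given type (e.g. NaN for F32)
        bool ok = ggml_validate_row_data(GGML_TYPE_F32, buf, sizeof(buf));
        printf("row data valid: %s\n", ok ? "yes" : "no");
        return 0;
    }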
@@ -1720,6 +1731,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
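
A sketch of building a graph node with the new fused attention op, following the shape comments in the hunk above. The sizes are made up, and the mask is left NULL for brevity (a real mask would be [n_kv, GGML_PAD(n_batch, GGML_KQ_MASK_PAD), 1, 1]); which backends actually execute the op is outside this sketch:

    #include <math.h>
    #include "ggml.h"

    int main(void) {
        // illustrative sizes
        const int n_embd = 64, n_head = 8, n_batch = 1, n_kv = 128;

        struct ggml_init_params ip = {
            .mem_size   = 16*1024*1024,
            .mem_buffer = NULL,
            .no_alloc   = false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        // shapes per the comments: q is [n_embd, n_batch, n_head, 1],
        // k and v are [n_embd, n_kv, n_head, 1] (v not transposed)
        struct ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, n_batch, n_head, 1);
        struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, n_kv,    n_head, 1);
        struct ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, n_kv,    n_head, 1);

        // result is [n_embd, n_head, n_batch, 1] (permuted),
        // with attention scores scaled by 1/sqrt(n_embd)
        struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, /*mask*/ NULL,
                                                       1.0f/sqrtf((float) n_embd));

        // optionally request fp32 precision for the fused kernel
        ggml_flash_attn_ext_set_prec(out, GGML_PREC_F32);

        ggml_free(ctx);
        return 0;
    }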