llama_cpp 0.15.2 → 0.15.3

@@ -481,9 +481,7 @@ extern "C" {
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
-        GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_ATTN_EXT,
-        GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
         GGML_OP_SSM_SCAN,
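
The GGML_OP_FLASH_ATTN and GGML_OP_FLASH_FF entries are removed from enum ggml_op, leaving GGML_OP_FLASH_ATTN_EXT and GGML_OP_FLASH_ATTN_BACK as the fused attention ops. A minimal sketch of how downstream code that switches over tensor->op has to be adjusted; the helper name is hypothetical and not part of the library:

    #include "ggml.h"

    // Hypothetical downstream helper: the GGML_OP_FLASH_ATTN and GGML_OP_FLASH_FF
    // cases no longer compile against 0.15.3 and must be dropped.
    static int is_fused_attention_node(const struct ggml_tensor * node) {
        switch (node->op) {
        case GGML_OP_FLASH_ATTN_EXT:   // the remaining fused forward op
        case GGML_OP_FLASH_ATTN_BACK:  // backward op, still present
            return 1;
        default:
            return 0;
        }
    }
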
@@ -1460,11 +1458,12 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
+    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1483,10 +1482,11 @@ extern "C" {
             int                   n_ctx);
 
     // custom RoPE
-    GGML_API struct ggml_tensor * ggml_rope_custom(
+    GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
@@ -1499,10 +1499,11 @@ extern "C" {
             float                 beta_slow);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+    GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
@@ -1514,18 +1515,41 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
-    // compute correction dims for YaRN RoPE scaling
-    GGML_CALL void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            int                   n_orig_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow),
+        "use ggml_rope_ext instead");
 
-    // xPos RoPE, in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             int                   n_dims,
-            float                 base,
-            bool                  down);
+            int                   mode,
+            int                   n_ctx,
+            int                   n_orig_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow),
+        "use ggml_rope_ext_inplace instead");
+
+    // compute correction dims for YaRN RoPE scaling
+    GGML_CALL void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
@@ -1533,6 +1557,7 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
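
ggml_rope_custom and ggml_rope_custom_inplace are deprecated in favour of ggml_rope_ext / ggml_rope_ext_inplace, which take an extra tensor c of frequency factors, and ggml_rope_back grows the same parameter. A migration sketch follows; the function name apply_rope and the numeric defaults are assumptions for illustration, and passing NULL for c reproduces the old behaviour:

    #include <stddef.h>
    #include "ggml.h"

    // cur: activations to rotate, e.g. [head_dim, n_head, n_tokens, 1]
    // pos: int32 positions, one per cur->ne[2] entry (see the comment above)
    static struct ggml_tensor * apply_rope(struct ggml_context * ctx,
                                           struct ggml_tensor  * cur,
                                           struct ggml_tensor  * pos) {
        // before 0.15.3: ggml_rope_custom(ctx, cur, pos, n_dims, mode, ...)
        return ggml_rope_ext(
                ctx, cur, pos,
                NULL,               // c: optional freq factors (e.g. phi3-128k)
                (int) cur->ne[0],   // n_dims
                0,                  // mode: 0 = original, 2 = GPT-NeoX, 4 = ChatGLM
                0,                  // n_ctx
                0,                  // n_orig_ctx
                10000.0f,           // freq_base   (assumed typical value)
                1.0f,               // freq_scale
                0.0f,               // ext_factor  (YaRN)
                1.0f,               // attn_factor (YaRN)
                32.0f,              // beta_fast   (YaRN)
                1.0f);              // beta_slow   (YaRN)
    }
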
@@ -1734,13 +1759,6 @@ extern "C" {
             struct ggml_tensor  * a,
             int                   k);
 
-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * q,
-            struct ggml_tensor  * k,
-            struct ggml_tensor  * v,
-            bool                  masked);
-
 #define GGML_KQ_MASK_PAD 32
 
     // q:    [n_embd, n_batch,     n_head, 1]
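
ggml_flash_attn is removed outright; the fused path is now ggml_flash_attn_ext, which takes an explicit (padded) mask tensor plus float parameters instead of the old bool masked flag. A hedged sketch: the name fused_attention is hypothetical, and the trailing scale and max_bias arguments are assumed from upstream ggml of this period, so check them against the bundled ggml.h:

    #include <math.h>
    #include <stddef.h>
    #include "ggml.h"

    // q/k/v follow the layout documented above; mask may be NULL, otherwise its
    // row count is padded to a multiple of GGML_KQ_MASK_PAD.
    static struct ggml_tensor * fused_attention(struct ggml_context * ctx,
                                                struct ggml_tensor  * q,
                                                struct ggml_tensor  * k,
                                                struct ggml_tensor  * v,
                                                struct ggml_tensor  * mask) {
        const float scale    = 1.0f / sqrtf((float) q->ne[0]); // 1/sqrt(head_dim)
        const float max_bias = 0.0f;                            // 0 disables ALiBi
        return ggml_flash_attn_ext(ctx, q, k, v, mask, scale, max_bias);
    }
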
@@ -1761,6 +1779,7 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_prec        prec);
 
+    // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
@@ -1769,14 +1788,6 @@ extern "C" {
             struct ggml_tensor  * d,
             bool                  masked);
 
-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b0,
-            struct ggml_tensor  * b1,
-            struct ggml_tensor  * c0,
-            struct ggml_tensor  * c1);
-
     GGML_API struct ggml_tensor * ggml_ssm_conv(
             struct ggml_context * ctx,
             struct ggml_tensor  * s,
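
ggml_flash_ff is removed without a direct replacement. If a call site still needs it, the same result can be built from the ordinary primitives; the sketch below assumes the removed op computed a standard two-layer GELU feed-forward (weights b0/c0, biases b1/c1 in the old signature), which should be verified against the old implementation, and the function name feed_forward is hypothetical:

    #include "ggml.h"

    // Unfused feed-forward: x -> w0*x + b0 -> GELU -> w1*(.) + b1
    static struct ggml_tensor * feed_forward(struct ggml_context * ctx,
                                             struct ggml_tensor  * x,
                                             struct ggml_tensor  * w0,
                                             struct ggml_tensor  * b0,
                                             struct ggml_tensor  * w1,
                                             struct ggml_tensor  * b1) {
        struct ggml_tensor * cur = ggml_mul_mat(ctx, w0, x);  // up-projection
        cur = ggml_add(ctx, cur, b0);
        cur = ggml_gelu(ctx, cur);                             // activation
        cur = ggml_mul_mat(ctx, w1, cur);                      // down-projection
        cur = ggml_add(ctx, cur, b1);
        return cur;
    }
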
@@ -2390,6 +2401,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512     (void);
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma        (void);
     GGML_API int ggml_cpu_has_neon       (void);
     GGML_API int ggml_cpu_has_arm_fma    (void);
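
ggml_cpu_has_avx512_bf16 joins the existing CPU feature probes; each returns 0 or 1 depending on the flags the library was built with. A small usage sketch:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        printf("AVX512      : %d\n", ggml_cpu_has_avx512());
        printf("AVX512_VBMI : %d\n", ggml_cpu_has_avx512_vbmi());
        printf("AVX512_VNNI : %d\n", ggml_cpu_has_avx512_vnni());
        printf("AVX512_BF16 : %d\n", ggml_cpu_has_avx512_bf16()); // new in this release
        printf("FMA         : %d\n", ggml_cpu_has_fma());
        printf("NEON        : %d\n", ggml_cpu_has_neon());
        return 0;
    }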