llama_cpp 0.15.2 → 0.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -481,9 +481,7 @@ extern "C" {
481
481
  GGML_OP_ARGSORT,
482
482
  GGML_OP_LEAKY_RELU,
483
483
 
484
- GGML_OP_FLASH_ATTN,
485
484
  GGML_OP_FLASH_ATTN_EXT,
486
- GGML_OP_FLASH_FF,
487
485
  GGML_OP_FLASH_ATTN_BACK,
488
486
  GGML_OP_SSM_CONV,
489
487
  GGML_OP_SSM_SCAN,
@@ -1460,11 +1458,12 @@ extern "C" {
1460
1458
  struct ggml_tensor * b);
1461
1459
 
1462
1460
  // rotary position embedding
1463
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
1461
+ // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
1464
1462
  // if mode & 2 == 1, GPT-NeoX style
1465
1463
  // if mode & 4 == 1, ChatGLM style
1466
1464
  //
1467
1465
  // b is an int32 vector with size a->ne[2], it contains the positions
1466
+ // c is freq factors (e.g. phi3-128k), (optional)
1468
1467
  GGML_API struct ggml_tensor * ggml_rope(
1469
1468
  struct ggml_context * ctx,
1470
1469
  struct ggml_tensor * a,
@@ -1483,10 +1482,11 @@ extern "C" {
1483
1482
  int n_ctx);
1484
1483
 
1485
1484
  // custom RoPE
1486
- GGML_API struct ggml_tensor * ggml_rope_custom(
1485
+ GGML_API struct ggml_tensor * ggml_rope_ext(
1487
1486
  struct ggml_context * ctx,
1488
1487
  struct ggml_tensor * a,
1489
1488
  struct ggml_tensor * b,
1489
+ struct ggml_tensor * c,
1490
1490
  int n_dims,
1491
1491
  int mode,
1492
1492
  int n_ctx,
@@ -1499,10 +1499,11 @@ extern "C" {
1499
1499
  float beta_slow);
1500
1500
 
1501
1501
  // in-place, returns view(a)
1502
- GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1502
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
1503
1503
  struct ggml_context * ctx,
1504
1504
  struct ggml_tensor * a,
1505
1505
  struct ggml_tensor * b,
1506
+ struct ggml_tensor * c,
1506
1507
  int n_dims,
1507
1508
  int mode,
1508
1509
  int n_ctx,
@@ -1514,18 +1515,41 @@ extern "C" {
1514
1515
  float beta_fast,
1515
1516
  float beta_slow);
1516
1517
 
1517
- // compute correction dims for YaRN RoPE scaling
1518
- GGML_CALL void ggml_rope_yarn_corr_dims(
1519
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1518
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
1519
+ struct ggml_context * ctx,
1520
+ struct ggml_tensor * a,
1521
+ struct ggml_tensor * b,
1522
+ int n_dims,
1523
+ int mode,
1524
+ int n_ctx,
1525
+ int n_orig_ctx,
1526
+ float freq_base,
1527
+ float freq_scale,
1528
+ float ext_factor,
1529
+ float attn_factor,
1530
+ float beta_fast,
1531
+ float beta_slow),
1532
+ "use ggml_rope_ext instead");
1520
1533
 
1521
- // xPos RoPE, in-place, returns view(a)
1522
- GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
1534
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1523
1535
  struct ggml_context * ctx,
1524
1536
  struct ggml_tensor * a,
1525
1537
  struct ggml_tensor * b,
1526
1538
  int n_dims,
1527
- float base,
1528
- bool down);
1539
+ int mode,
1540
+ int n_ctx,
1541
+ int n_orig_ctx,
1542
+ float freq_base,
1543
+ float freq_scale,
1544
+ float ext_factor,
1545
+ float attn_factor,
1546
+ float beta_fast,
1547
+ float beta_slow),
1548
+ "use ggml_rope_ext_inplace instead");
1549
+
1550
+ // compute correction dims for YaRN RoPE scaling
1551
+ GGML_CALL void ggml_rope_yarn_corr_dims(
1552
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1529
1553
 
1530
1554
  // rotary position embedding backward, i.e compute dx from dy
1531
1555
  // a - dy
@@ -1533,6 +1557,7 @@ extern "C" {
1533
1557
  struct ggml_context * ctx,
1534
1558
  struct ggml_tensor * a,
1535
1559
  struct ggml_tensor * b,
1560
+ struct ggml_tensor * c,
1536
1561
  int n_dims,
1537
1562
  int mode,
1538
1563
  int n_ctx,
@@ -1734,13 +1759,6 @@ extern "C" {
1734
1759
  struct ggml_tensor * a,
1735
1760
  int k);
1736
1761
 
1737
- GGML_API struct ggml_tensor * ggml_flash_attn(
1738
- struct ggml_context * ctx,
1739
- struct ggml_tensor * q,
1740
- struct ggml_tensor * k,
1741
- struct ggml_tensor * v,
1742
- bool masked);
1743
-
1744
1762
  #define GGML_KQ_MASK_PAD 32
1745
1763
 
1746
1764
  // q: [n_embd, n_batch, n_head, 1]
@@ -1761,6 +1779,7 @@ extern "C" {
1761
1779
  struct ggml_tensor * a,
1762
1780
  enum ggml_prec prec);
1763
1781
 
1782
+ // TODO: needs to be adapted to ggml_flash_attn_ext
1764
1783
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
1765
1784
  struct ggml_context * ctx,
1766
1785
  struct ggml_tensor * q,
@@ -1769,14 +1788,6 @@ extern "C" {
1769
1788
  struct ggml_tensor * d,
1770
1789
  bool masked);
1771
1790
 
1772
- GGML_API struct ggml_tensor * ggml_flash_ff(
1773
- struct ggml_context * ctx,
1774
- struct ggml_tensor * a,
1775
- struct ggml_tensor * b0,
1776
- struct ggml_tensor * b1,
1777
- struct ggml_tensor * c0,
1778
- struct ggml_tensor * c1);
1779
-
1780
1791
  GGML_API struct ggml_tensor * ggml_ssm_conv(
1781
1792
  struct ggml_context * ctx,
1782
1793
  struct ggml_tensor * s,
@@ -2390,6 +2401,7 @@ extern "C" {
2390
2401
  GGML_API int ggml_cpu_has_avx512 (void);
2391
2402
  GGML_API int ggml_cpu_has_avx512_vbmi(void);
2392
2403
  GGML_API int ggml_cpu_has_avx512_vnni(void);
2404
+ GGML_API int ggml_cpu_has_avx512_bf16(void);
2393
2405
  GGML_API int ggml_cpu_has_fma (void);
2394
2406
  GGML_API int ggml_cpu_has_neon (void);
2395
2407
  GGML_API int ggml_cpu_has_arm_fma (void);