llama_cpp 0.15.1 → 0.15.3

@@ -468,7 +468,6 @@ extern "C" {
  GGML_OP_SOFT_MAX_BACK,
  GGML_OP_ROPE,
  GGML_OP_ROPE_BACK,
- GGML_OP_ALIBI,
  GGML_OP_CLAMP,
  GGML_OP_CONV_TRANSPOSE_1D,
  GGML_OP_IM2COL,
@@ -482,9 +481,7 @@ extern "C" {
  GGML_OP_ARGSORT,
  GGML_OP_LEAKY_RELU,

- GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_ATTN_EXT,
- GGML_OP_FLASH_FF,
  GGML_OP_FLASH_ATTN_BACK,
  GGML_OP_SSM_CONV,
  GGML_OP_SSM_SCAN,
@@ -520,6 +517,7 @@ extern "C" {
  GGML_UNARY_OP_TANH,
  GGML_UNARY_OP_ELU,
  GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_SIGMOID,
  GGML_UNARY_OP_GELU,
  GGML_UNARY_OP_GELU_QUICK,
  GGML_UNARY_OP_SILU,
@@ -565,7 +563,8 @@ extern "C" {
  // n-dimensional tensor
  struct ggml_tensor {
  enum ggml_type type;
- enum ggml_backend_type backend;
+
+ GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

  struct ggml_backend_buffer * buffer;

@@ -766,7 +765,8 @@ extern "C" {
  GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
  GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars

- GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

  // use this to compute the memory overhead of a tensor
  GGML_API size_t ggml_tensor_overhead(void);
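The new ggml_are_same_stride predicate complements ggml_are_same_shape. A minimal sketch of how the two might be combined (t0 and t1 are placeholder tensors from the caller):

    // illustrative check: matching shape and stride means identical memory layout
    if (ggml_are_same_shape(t0, t1) && ggml_are_same_stride(t0, t1)) {
        // t0 and t1 can be treated as having the same layout
    }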
@@ -1074,6 +1074,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sigmoid(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_gelu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
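The new sigmoid op follows the pattern of the other unary ops, with out-of-place and in-place variants. A minimal usage sketch, assuming ctx is an already-initialized ggml_context and the tensor shape is illustrative:

    struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * y  = ggml_sigmoid(ctx, x);          // new tensor
    struct ggml_tensor * yi = ggml_sigmoid_inplace(ctx, x);  // returns view(x)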
@@ -1428,15 +1436,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+ // fused soft_max(a*scale + mask*(ALiBi slope))
  // mask is optional
- // pos is required when max_bias > 0.0f
  // max_bias = 0.0f for no ALiBi
  GGML_API struct ggml_tensor * ggml_soft_max_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * mask,
- struct ggml_tensor * pos,
  float scale,
  float max_bias);
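With the ALiBi slope now applied through the mask, ggml_soft_max_ext drops the separate pos tensor. A hedged migration sketch (kq, kq_mask and scale are placeholders from the caller's attention graph):

    // 0.15.1 headers: ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, scale, max_bias);
    // 0.15.3 headers: no pos argument; max_bias = 0.0f disables ALiBi
    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, kq_mask, scale, 0.0f);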
 
@@ -1452,11 +1458,12 @@ extern "C" {
  struct ggml_tensor * b);

  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+ // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
  // if mode & 2 == 1, GPT-NeoX style
  // if mode & 4 == 1, ChatGLM style
  //
  // b is an int32 vector with size a->ne[2], it contains the positions
+ // c is freq factors (e.g. phi3-128k), (optional)
  GGML_API struct ggml_tensor * ggml_rope(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1475,10 +1482,11 @@ extern "C" {
  int n_ctx);

  // custom RoPE
- GGML_API struct ggml_tensor * ggml_rope_custom(
+ GGML_API struct ggml_tensor * ggml_rope_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1491,10 +1499,11 @@ extern "C" {
  float beta_slow);

  // in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1506,18 +1515,41 @@ extern "C" {
  float beta_fast,
  float beta_slow);

- // compute correction dims for YaRN RoPE scaling
- GGML_CALL void ggml_rope_yarn_corr_dims(
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext instead");

- // xPos RoPE, in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- float base,
- bool down);
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext_inplace instead");
+
+ // compute correction dims for YaRN RoPE scaling
+ GGML_CALL void ggml_rope_yarn_corr_dims(
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

  // rotary position embedding backward, i.e compute dx from dy
  // a - dy
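ggml_rope_ext takes the same arguments as the now-deprecated ggml_rope_custom plus the optional freq-factors tensor c. A migration sketch under that assumption (pos and the tuning parameters are the caller's existing values; pass NULL for c when no frequency factors are used):

    // before: ggml_rope_custom(ctx, a, pos, n_dims, mode, n_ctx, n_orig_ctx,
    //                          freq_base, freq_scale, ext_factor, attn_factor,
    //                          beta_fast, beta_slow);
    struct ggml_tensor * rotated =
        ggml_rope_ext(ctx, a, pos, NULL /* c: freq factors, optional */,
                      n_dims, mode, n_ctx, n_orig_ctx,
                      freq_base, freq_scale, ext_factor, attn_factor,
                      beta_fast, beta_slow);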
@@ -1525,6 +1557,7 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1538,16 +1571,6 @@ extern "C" {
  float xpos_base,
  bool xpos_down);

- // alibi position embedding
- // in-place, returns view(a)
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int n_past,
- int n_head,
- float bias_max),
- "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
  // clamp
  // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_clamp(
@@ -1677,12 +1700,24 @@ extern "C" {
  float p1);

  // nearest interpolate
+ // multiplies ne0 and ne1 by scale factor
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_upscale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int scale_factor);

+ // nearest interpolate
+ // nearest interpolate to specified dimensions
+ // used in tortoise.cpp
+ GGML_API struct ggml_tensor * ggml_upscale_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3);
+
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
  GGML_API struct ggml_tensor * ggml_pad(
  struct ggml_context * ctx,
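ggml_upscale_ext resizes to explicit output dimensions rather than a single scale factor. A small sketch (src and the target sizes are illustrative; the last two dimensions are left unchanged):

    struct ggml_tensor * up = ggml_upscale_ext(ctx, src, 128, 128,
                                               (int) src->ne[2], (int) src->ne[3]);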
@@ -1724,13 +1759,6 @@ extern "C" {
  struct ggml_tensor * a,
  int k);

- GGML_API struct ggml_tensor * ggml_flash_attn(
- struct ggml_context * ctx,
- struct ggml_tensor * q,
- struct ggml_tensor * k,
- struct ggml_tensor * v,
- bool masked);
-
  #define GGML_KQ_MASK_PAD 32

  // q: [n_embd, n_batch, n_head, 1]
@@ -1744,12 +1772,14 @@ extern "C" {
  struct ggml_tensor * k,
  struct ggml_tensor * v,
  struct ggml_tensor * mask,
- float scale);
+ float scale,
+ float max_bias);

  GGML_API void ggml_flash_attn_ext_set_prec(
  struct ggml_tensor * a,
  enum ggml_prec prec);

+ // TODO: needs to be adapted to ggml_flash_attn_ext
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
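Existing ggml_flash_attn_ext callers only need to append the new max_bias argument; 0.0f preserves the previous no-ALiBi behaviour. Sketch (q, k, v, kq_mask and scale are placeholders from the caller's graph):

    struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, scale, 0.0f /* max_bias */);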
@@ -1758,14 +1788,6 @@ extern "C" {
  struct ggml_tensor * d,
  bool masked);

- GGML_API struct ggml_tensor * ggml_flash_ff(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b0,
- struct ggml_tensor * b1,
- struct ggml_tensor * c0,
- struct ggml_tensor * c1);
-
  GGML_API struct ggml_tensor * ggml_ssm_conv(
  struct ggml_context * ctx,
  struct ggml_tensor * s,
@@ -2379,6 +2401,7 @@ extern "C" {
  GGML_API int ggml_cpu_has_avx512 (void);
  GGML_API int ggml_cpu_has_avx512_vbmi(void);
  GGML_API int ggml_cpu_has_avx512_vnni(void);
+ GGML_API int ggml_cpu_has_avx512_bf16(void);
  GGML_API int ggml_cpu_has_fma (void);
  GGML_API int ggml_cpu_has_neon (void);
  GGML_API int ggml_cpu_has_arm_fma (void);
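The new predicate is queried like the existing CPU-feature checks; it reports whether this ggml build has AVX512-BF16 support. A self-contained sketch:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // prints 1 when the loaded ggml was built with AVX512-BF16 support, 0 otherwise
        printf("avx512_bf16 = %d\n", ggml_cpu_has_avx512_bf16());
        return 0;
    }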