llama_cpp 0.15.2 → 0.15.4

@@ -481,9 +481,7 @@ extern "C" {
  GGML_OP_ARGSORT,
  GGML_OP_LEAKY_RELU,
 
- GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_ATTN_EXT,
- GGML_OP_FLASH_FF,
  GGML_OP_FLASH_ATTN_BACK,
  GGML_OP_SSM_CONV,
  GGML_OP_SSM_SCAN,
@@ -758,7 +756,6 @@ extern "C" {
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
  GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
  GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
  GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
@@ -767,6 +764,11 @@ extern "C" {
  GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
  GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+ GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+ GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+ GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
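
The contiguity predicates are regrouped here and three graded variants are added. A minimal sketch of how ggml_is_contiguous() might guard a raw copy out of a tensor (the helper name and buffer handling are illustrative, not part of this diff):

    #include <string.h>
    #include "ggml.h"

    // Copy a tensor's payload into a caller-provided buffer, but only when the
    // data is one dense block; permuted/transposed views need ggml_cont() first.
    static size_t export_tensor_data(const struct ggml_tensor * t, void * dst, size_t dst_size) {
        const size_t nbytes = ggml_nbytes(t);
        if (!ggml_is_contiguous(t) || nbytes > dst_size) {
            return 0;
        }
        memcpy(dst, t->data, nbytes);
        return nbytes;
    }
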
@@ -1009,12 +1011,13 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
- // concat a and b on dim 2
+ // concat a and b along dim
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_concat(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int dim);
 
  GGML_API struct ggml_tensor * ggml_abs(
  struct ggml_context * ctx,
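
Since ggml_concat previously always concatenated along dim 2, existing call sites can be ported mechanically by passing 2 for the new dim argument; a small compatibility sketch (the wrapper name is hypothetical):

    #include "ggml.h"

    // Pre-0.15.4 behaviour of ggml_concat(ctx, a, b): concatenation along dim 2.
    static struct ggml_tensor * concat_dim2(struct ggml_context * ctx,
                                            struct ggml_tensor  * a,
                                            struct ggml_tensor  * b) {
        return ggml_concat(ctx, a, b, 2);
    }
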
@@ -1460,11 +1463,12 @@ extern "C" {
  struct ggml_tensor * b);
 
  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+ // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
  // if mode & 2 == 1, GPT-NeoX style
  // if mode & 4 == 1, ChatGLM style
  //
  // b is an int32 vector with size a->ne[2], it contains the positions
+ // c is freq factors (e.g. phi3-128k), (optional)
  GGML_API struct ggml_tensor * ggml_rope(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1483,10 +1487,11 @@ extern "C" {
  int n_ctx);
 
  // custom RoPE
- GGML_API struct ggml_tensor * ggml_rope_custom(
+ GGML_API struct ggml_tensor * ggml_rope_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1499,10 +1504,11 @@ extern "C" {
  float beta_slow);
 
  // in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1514,18 +1520,49 @@ extern "C" {
  float beta_fast,
  float beta_slow);
 
- // compute correction dims for YaRN RoPE scaling
- GGML_CALL void ggml_rope_yarn_corr_dims(
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext instead");
 
- // xPos RoPE, in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- float base,
- bool down);
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext_inplace instead");
+
+ struct ggml_tensor * ggml_rope_xpos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ float base,
+ bool down);
+
+ // compute correction dims for YaRN RoPE scaling
+ GGML_CALL void ggml_rope_yarn_corr_dims(
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
  // rotary position embedding backward, i.e compute dx from dy
  // a - dy
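
For callers of the now-deprecated ggml_rope_custom / ggml_rope_custom_inplace, the hunk above shows that ggml_rope_ext takes the same arguments plus a frequency-factors tensor c; passing NULL for c keeps the old behaviour. A hedged compatibility sketch (the wrapper name is made up for illustration):

    #include <stddef.h>
    #include "ggml.h"

    // Forwards the old ggml_rope_custom argument list to ggml_rope_ext with
    // c == NULL, i.e. without per-dimension frequency factors (phi3-128k style).
    static struct ggml_tensor * rope_custom_compat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            int n_dims, int mode, int n_ctx, int n_orig_ctx,
            float freq_base, float freq_scale, float ext_factor,
            float attn_factor, float beta_fast, float beta_slow) {
        return ggml_rope_ext(ctx, a, b, /*c =*/ NULL,
                             n_dims, mode, n_ctx, n_orig_ctx,
                             freq_base, freq_scale, ext_factor,
                             attn_factor, beta_fast, beta_slow);
    }
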
@@ -1533,6 +1570,7 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
  int n_ctx,
@@ -1734,13 +1772,6 @@ extern "C" {
  struct ggml_tensor * a,
  int k);
 
- GGML_API struct ggml_tensor * ggml_flash_attn(
- struct ggml_context * ctx,
- struct ggml_tensor * q,
- struct ggml_tensor * k,
- struct ggml_tensor * v,
- bool masked);
-
  #define GGML_KQ_MASK_PAD 32
 
  // q: [n_embd, n_batch, n_head, 1]
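
ggml_flash_attn is removed in this release; the fused path is ggml_flash_attn_ext. A rough migration sketch, assuming the ggml_flash_attn_ext signature in the bundled llama.cpp is (ctx, q, k, v, mask, scale, max_bias) — verify against ggml.h — where the explicit KQ mask and scale replace the old boolean masked flag:

    #include <math.h>
    #include "ggml.h"

    // Approximate stand-in for the removed ggml_flash_attn(ctx, q, k, v, true):
    // the mask is now passed explicitly (padded per GGML_KQ_MASK_PAD) and the
    // attention scale is no longer implicit. max_bias = 0.0f disables ALiBi.
    static struct ggml_tensor * flash_attn_compat(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
            struct ggml_tensor  * k,
            struct ggml_tensor  * v,
            struct ggml_tensor  * kq_mask) {
        const float scale = 1.0f / sqrtf((float) q->ne[0]); // ne[0] = head size
        return ggml_flash_attn_ext(ctx, q, k, v, kq_mask, scale, 0.0f);
    }
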
@@ -1761,6 +1792,7 @@ extern "C" {
  struct ggml_tensor * a,
  enum ggml_prec prec);
 
+ // TODO: needs to be adapted to ggml_flash_attn_ext
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
@@ -1769,14 +1801,6 @@ extern "C" {
  struct ggml_tensor * d,
  bool masked);
 
- GGML_API struct ggml_tensor * ggml_flash_ff(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b0,
- struct ggml_tensor * b1,
- struct ggml_tensor * c0,
- struct ggml_tensor * c1);
-
  GGML_API struct ggml_tensor * ggml_ssm_conv(
  struct ggml_context * ctx,
  struct ggml_tensor * s,
@@ -2390,8 +2414,10 @@ extern "C" {
  GGML_API int ggml_cpu_has_avx512 (void);
  GGML_API int ggml_cpu_has_avx512_vbmi(void);
  GGML_API int ggml_cpu_has_avx512_vnni(void);
+ GGML_API int ggml_cpu_has_avx512_bf16(void);
  GGML_API int ggml_cpu_has_fma (void);
  GGML_API int ggml_cpu_has_neon (void);
+ GGML_API int ggml_cpu_has_sve (void);
  GGML_API int ggml_cpu_has_arm_fma (void);
  GGML_API int ggml_cpu_has_metal (void);
  GGML_API int ggml_cpu_has_f16c (void);
@@ -2406,6 +2432,7 @@ extern "C" {
  GGML_API int ggml_cpu_has_sse3 (void);
  GGML_API int ggml_cpu_has_ssse3 (void);
  GGML_API int ggml_cpu_has_sycl (void);
+ GGML_API int ggml_cpu_has_rpc (void);
  GGML_API int ggml_cpu_has_vsx (void);
  GGML_API int ggml_cpu_has_matmul_int8(void);
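
The new capability probes follow the same pattern as the existing ones and are safe to call on any platform; a small sketch that reports the flags added in this release (the printf wording is illustrative):

    #include <stdio.h>
    #include "ggml.h"

    // Each ggml_cpu_has_* probe returns 0 or 1 depending on build/runtime support.
    static void print_new_cpu_features(void) {
        printf("AVX512-BF16: %d\n", ggml_cpu_has_avx512_bf16());
        printf("SVE:         %d\n", ggml_cpu_has_sve());
        printf("RPC backend: %d\n", ggml_cpu_has_rpc());
    }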