llama_cpp 0.15.2 → 0.15.4

This diff compares the contents of two publicly released versions of the package as published to their registry. It is provided for informational purposes only.
@@ -481,9 +481,7 @@ extern "C" {
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,

-        GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_ATTN_EXT,
-        GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
         GGML_OP_SSM_SCAN,
@@ -758,7 +756,6 @@ extern "C" {
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

     GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
     GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
@@ -767,6 +764,11 @@ extern "C" {
     GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
     GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars

+    GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

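The contiguity predicates move below `ggml_n_dims` and gain three graded variants. A minimal sketch of how a caller might branch on them, assuming F32 data and an at-most-2-D tensor; `scale_f32_2d` is a hypothetical helper, not part of this header:

```c
#include <assert.h>
#include "ggml.h"

// Hypothetical sketch: scale an F32 tensor in place, picking the
// access pattern based on the new graded contiguity predicates.
static void scale_f32_2d(struct ggml_tensor * t, float s) {
    assert(t->type == GGML_TYPE_F32 && ggml_n_dims(t) <= 2);
    if (ggml_is_contiguous(t)) {
        // fully dense: safe to treat the buffer as one flat array
        float * x = (float *) t->data;
        for (int64_t i = 0; i < ggml_nelements(t); ++i) {
            x[i] *= s;
        }
    } else if (ggml_is_contiguous_1(t)) {
        // rows are dense but the stride between rows (nb[1]) is arbitrary,
        // e.g. a view; walk row by row using the byte strides
        for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
            float * row = (float *) ((char *) t->data + i1*t->nb[1]);
            for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
                row[i0] *= s;
            }
        }
    }
    // (a fully strided fallback using nb[0] would go here)
}
```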
@@ -1009,12 +1011,13 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

-    // concat a and b on dim 2
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            struct ggml_tensor * b);
+            struct ggml_tensor * b,
+            int dim);

     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
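`ggml_concat` previously always concatenated on dim 2, which the old comment documented; the dimension is now an explicit argument. A short migration sketch, with `ctx`, `a`, and `b` assumed from the surrounding graph code:

```c
// before (0.15.2): concatenation was hardwired to dim 2
// struct ggml_tensor * out = ggml_concat(ctx, a, b);

// after (0.15.4): pass dim 2 explicitly to keep the old behavior
struct ggml_tensor * out = ggml_concat(ctx, a, b, 2);

// other axes are now possible, e.g. concatenating along dim 1,
// provided the remaining dims of a and b agree
struct ggml_tensor * rows = ggml_concat(ctx, a, b, 1);
```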
@@ -1460,11 +1463,12 @@ extern "C" {
             struct ggml_tensor * b);

     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
+    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
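The comments above pin down the `b` argument: an I32 vector of length `a->ne[2]` holding token positions. A minimal sketch of filling one, assuming a data-allocating context (`no_alloc == false`) and hypothetical `cur`, `n_past`, and `n_rot` from the caller:

```c
// Hypothetical sketch: build the positions vector `b` expected by ggml_rope.
const int64_t n_tokens = cur->ne[2];
struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
for (int64_t i = 0; i < n_tokens; ++i) {
    ((int32_t *) pos->data)[i] = (int32_t) (n_past + i); // absolute position of token i
}
// mode 0 = original RoPE; mode 2 would select the GPT-NeoX style
// per the flag comments above
struct ggml_tensor * rotated = ggml_rope(ctx, cur, pos, n_rot, /*mode=*/0, /*n_ctx=*/0);
```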
@@ -1483,10 +1487,11 @@ extern "C" {
             int n_ctx);

     // custom RoPE
-    GGML_API struct ggml_tensor * ggml_rope_custom(
+    GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
+            struct ggml_tensor * c,
             int n_dims,
             int mode,
             int n_ctx,
@@ -1499,10 +1504,11 @@ extern "C" {
             float beta_slow);

     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+    GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
+            struct ggml_tensor * c,
             int n_dims,
             int mode,
             int n_ctx,
@@ -1514,18 +1520,49 @@ extern "C" {
             float beta_fast,
             float beta_slow);

-    // compute correction dims for YaRN RoPE scaling
-    GGML_CALL void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            int n_dims,
+            int mode,
+            int n_ctx,
+            int n_orig_ctx,
+            float freq_base,
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow),
+        "use ggml_rope_ext instead");

-    // xPos RoPE, in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             int n_dims,
-            float base,
-            bool down);
+            int mode,
+            int n_ctx,
+            int n_orig_ctx,
+            float freq_base,
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow),
+        "use ggml_rope_ext_inplace instead");
+
+    struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            int n_dims,
+            float base,
+            bool down);
+
+    // compute correction dims for YaRN RoPE scaling
+    GGML_CALL void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
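`ggml_rope_custom` and its in-place variant survive only behind `GGML_DEPRECATED`. The replacements differ by exactly one new tensor `c` after `b`: the optional frequency factors documented earlier. Callers without per-model factors can pass NULL, as in this sketch (all other arguments assumed unchanged from an existing call site):

```c
// before (now deprecated, compiles with a warning):
// cur = ggml_rope_custom(ctx, cur, pos, n_dims, mode, n_ctx, n_orig_ctx,
//                        freq_base, freq_scale, ext_factor, attn_factor,
//                        beta_fast, beta_slow);

// after: identical except for the freq-factors tensor `c`; NULL is fine
// unless the model ships RoPE frequency factors (e.g. phi3-128k)
cur = ggml_rope_ext(ctx, cur, pos, NULL, n_dims, mode, n_ctx, n_orig_ctx,
                    freq_base, freq_scale, ext_factor, attn_factor,
                    beta_fast, beta_slow);
```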
@@ -1533,6 +1570,7 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
+            struct ggml_tensor * c,
             int n_dims,
             int mode,
             int n_ctx,
@@ -1734,13 +1772,6 @@ extern "C" {
             struct ggml_tensor * a,
             int k);

-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor * q,
-            struct ggml_tensor * k,
-            struct ggml_tensor * v,
-            bool masked);
-
     #define GGML_KQ_MASK_PAD 32

     // q: [n_embd, n_batch, n_head, 1]
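With `GGML_OP_FLASH_ATTN` gone from the op enum (first hunk) and `ggml_flash_attn()` deleted here, graphs must build `ggml_flash_attn_ext` nodes instead; its doc comment begins in the trailing context above. A hedged migration sketch: the parameter list (`mask`, `scale`, `max_bias`) reflects this era's header and should be checked against your copy, and `kq_mask` and `n_embd_head` are assumed from the surrounding code:

```c
// before: fused attention with a boolean mask flag
// struct ggml_tensor * out = ggml_flash_attn(ctx, q, k, v, /*masked=*/true);

// after: the mask is an explicit (optional) tensor whose batch dim is padded
// to a multiple of GGML_KQ_MASK_PAD, and the 1/sqrt(d) scale is passed in
// rather than implied (sqrtf needs <math.h>)
const float scale = 1.0f / sqrtf((float) n_embd_head);
struct ggml_tensor * out =
    ggml_flash_attn_ext(ctx, q, k, v, kq_mask, scale, /*max_bias=*/0.0f);
```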
@@ -1761,6 +1792,7 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_prec prec);

+    // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
@@ -1769,14 +1801,6 @@ extern "C" {
             struct ggml_tensor * d,
             bool masked);

-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b0,
-            struct ggml_tensor * b1,
-            struct ggml_tensor * c0,
-            struct ggml_tensor * c1);
-
     GGML_API struct ggml_tensor * ggml_ssm_conv(
             struct ggml_context * ctx,
             struct ggml_tensor * s,
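`ggml_flash_ff` is removed outright with no `_ext` successor; it fused a two-layer GELU feed-forward. A hedged sketch of the unfused equivalent, assuming (as in its historical callers) that `b0`/`b1` were the first projection's weight and bias and `c0`/`c1` the second's:

```c
// Hypothetical replacement for the removed ggml_flash_ff(ctx, a, b0, b1, c0, c1),
// composed from ops that still exist (the bias adds rely on broadcasting):
struct ggml_tensor * cur = ggml_mul_mat(ctx, b0, a); // first projection (weight-first convention)
cur = ggml_add(ctx, cur, b1);                        // + bias
cur = ggml_gelu(ctx, cur);                           // activation
cur = ggml_mul_mat(ctx, c0, cur);                    // second projection
cur = ggml_add(ctx, cur, c1);                        // + bias
```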
@@ -2390,8 +2414,10 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512 (void);
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma (void);
     GGML_API int ggml_cpu_has_neon (void);
+    GGML_API int ggml_cpu_has_sve (void);
     GGML_API int ggml_cpu_has_arm_fma (void);
     GGML_API int ggml_cpu_has_metal (void);
     GGML_API int ggml_cpu_has_f16c (void);
@@ -2406,6 +2432,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_sse3 (void);
     GGML_API int ggml_cpu_has_ssse3 (void);
     GGML_API int ggml_cpu_has_sycl (void);
+    GGML_API int ggml_cpu_has_rpc (void);
     GGML_API int ggml_cpu_has_vsx (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
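Three new capability probes round out the diff: AVX512-BF16, ARM SVE, and the RPC backend. Like the rest of the `ggml_cpu_has_*` family they return 0 or 1, so a quick standalone check might look like this:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    printf("avx512_bf16: %d\n", ggml_cpu_has_avx512_bf16()); // new in this diff
    printf("sve:         %d\n", ggml_cpu_has_sve());         // new in this diff
    printf("rpc:         %d\n", ggml_cpu_has_rpc());         // new in this diff
    printf("neon:        %d\n", ggml_cpu_has_neon());
    printf("metal:       %d\n", ggml_cpu_has_metal());
    return 0;
}
```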