llama_cpp 0.15.1 → 0.15.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -468,7 +468,6 @@ extern "C" {
|
|
468
468
|
GGML_OP_SOFT_MAX_BACK,
|
469
469
|
GGML_OP_ROPE,
|
470
470
|
GGML_OP_ROPE_BACK,
|
471
|
-
GGML_OP_ALIBI,
|
472
471
|
GGML_OP_CLAMP,
|
473
472
|
GGML_OP_CONV_TRANSPOSE_1D,
|
474
473
|
GGML_OP_IM2COL,
|
@@ -482,9 +481,7 @@ extern "C" {
|
|
482
481
|
GGML_OP_ARGSORT,
|
483
482
|
GGML_OP_LEAKY_RELU,
|
484
483
|
|
485
|
-
GGML_OP_FLASH_ATTN,
|
486
484
|
GGML_OP_FLASH_ATTN_EXT,
|
487
|
-
GGML_OP_FLASH_FF,
|
488
485
|
GGML_OP_FLASH_ATTN_BACK,
|
489
486
|
GGML_OP_SSM_CONV,
|
490
487
|
GGML_OP_SSM_SCAN,
|
@@ -520,6 +517,7 @@ extern "C" {
|
|
520
517
|
GGML_UNARY_OP_TANH,
|
521
518
|
GGML_UNARY_OP_ELU,
|
522
519
|
GGML_UNARY_OP_RELU,
|
520
|
+
GGML_UNARY_OP_SIGMOID,
|
523
521
|
GGML_UNARY_OP_GELU,
|
524
522
|
GGML_UNARY_OP_GELU_QUICK,
|
525
523
|
GGML_UNARY_OP_SILU,
|
@@ -565,7 +563,8 @@ extern "C" {
|
|
565
563
|
// n-dimensional tensor
|
566
564
|
struct ggml_tensor {
|
567
565
|
enum ggml_type type;
|
568
|
-
|
566
|
+
|
567
|
+
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
569
568
|
|
570
569
|
struct ggml_backend_buffer * buffer;
|
571
570
|
|
@@ -766,7 +765,8 @@ extern "C" {
|
|
766
765
|
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
767
766
|
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
768
767
|
|
769
|
-
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
768
|
+
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
769
|
+
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
770
770
|
|
771
771
|
// use this to compute the memory overhead of a tensor
|
772
772
|
GGML_API size_t ggml_tensor_overhead(void);
|
@@ -1074,6 +1074,14 @@ extern "C" {
|
|
1074
1074
|
struct ggml_context * ctx,
|
1075
1075
|
struct ggml_tensor * a);
|
1076
1076
|
|
1077
|
+
GGML_API struct ggml_tensor * ggml_sigmoid(
|
1078
|
+
struct ggml_context * ctx,
|
1079
|
+
struct ggml_tensor * a);
|
1080
|
+
|
1081
|
+
GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
|
1082
|
+
struct ggml_context * ctx,
|
1083
|
+
struct ggml_tensor * a);
|
1084
|
+
|
1077
1085
|
GGML_API struct ggml_tensor * ggml_gelu(
|
1078
1086
|
struct ggml_context * ctx,
|
1079
1087
|
struct ggml_tensor * a);
|
@@ -1428,15 +1436,13 @@ extern "C" {
|
|
1428
1436
|
struct ggml_context * ctx,
|
1429
1437
|
struct ggml_tensor * a);
|
1430
1438
|
|
1431
|
-
// fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
|
1439
|
+
// fused soft_max(a*scale + mask*(ALiBi slope))
|
1432
1440
|
// mask is optional
|
1433
|
-
// pos is required when max_bias > 0.0f
|
1434
1441
|
// max_bias = 0.0f for no ALiBi
|
1435
1442
|
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
1436
1443
|
struct ggml_context * ctx,
|
1437
1444
|
struct ggml_tensor * a,
|
1438
1445
|
struct ggml_tensor * mask,
|
1439
|
-
struct ggml_tensor * pos,
|
1440
1446
|
float scale,
|
1441
1447
|
float max_bias);
|
1442
1448
|
|
@@ -1452,11 +1458,12 @@ extern "C" {
|
|
1452
1458
|
struct ggml_tensor * b);
|
1453
1459
|
|
1454
1460
|
// rotary position embedding
|
1455
|
-
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
1461
|
+
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
|
1456
1462
|
// if mode & 2 == 1, GPT-NeoX style
|
1457
1463
|
// if mode & 4 == 1, ChatGLM style
|
1458
1464
|
//
|
1459
1465
|
// b is an int32 vector with size a->ne[2], it contains the positions
|
1466
|
+
// c is freq factors (e.g. phi3-128k), (optional)
|
1460
1467
|
GGML_API struct ggml_tensor * ggml_rope(
|
1461
1468
|
struct ggml_context * ctx,
|
1462
1469
|
struct ggml_tensor * a,
|
@@ -1475,10 +1482,11 @@ extern "C" {
|
|
1475
1482
|
int n_ctx);
|
1476
1483
|
|
1477
1484
|
// custom RoPE
|
1478
|
-
GGML_API struct ggml_tensor * ggml_rope_custom(
|
1485
|
+
GGML_API struct ggml_tensor * ggml_rope_ext(
|
1479
1486
|
struct ggml_context * ctx,
|
1480
1487
|
struct ggml_tensor * a,
|
1481
1488
|
struct ggml_tensor * b,
|
1489
|
+
struct ggml_tensor * c,
|
1482
1490
|
int n_dims,
|
1483
1491
|
int mode,
|
1484
1492
|
int n_ctx,
|
@@ -1491,10 +1499,11 @@ extern "C" {
|
|
1491
1499
|
float beta_slow);
|
1492
1500
|
|
1493
1501
|
// in-place, returns view(a)
|
1494
|
-
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1502
|
+
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
1495
1503
|
struct ggml_context * ctx,
|
1496
1504
|
struct ggml_tensor * a,
|
1497
1505
|
struct ggml_tensor * b,
|
1506
|
+
struct ggml_tensor * c,
|
1498
1507
|
int n_dims,
|
1499
1508
|
int mode,
|
1500
1509
|
int n_ctx,
|
@@ -1506,18 +1515,41 @@ extern "C" {
|
|
1506
1515
|
float beta_fast,
|
1507
1516
|
float beta_slow);
|
1508
1517
|
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1518
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
1519
|
+
struct ggml_context * ctx,
|
1520
|
+
struct ggml_tensor * a,
|
1521
|
+
struct ggml_tensor * b,
|
1522
|
+
int n_dims,
|
1523
|
+
int mode,
|
1524
|
+
int n_ctx,
|
1525
|
+
int n_orig_ctx,
|
1526
|
+
float freq_base,
|
1527
|
+
float freq_scale,
|
1528
|
+
float ext_factor,
|
1529
|
+
float attn_factor,
|
1530
|
+
float beta_fast,
|
1531
|
+
float beta_slow),
|
1532
|
+
"use ggml_rope_ext instead");
|
1512
1533
|
|
1513
|
-
|
1514
|
-
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1534
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1515
1535
|
struct ggml_context * ctx,
|
1516
1536
|
struct ggml_tensor * a,
|
1517
1537
|
struct ggml_tensor * b,
|
1518
1538
|
int n_dims,
|
1519
|
-
|
1520
|
-
|
1539
|
+
int mode,
|
1540
|
+
int n_ctx,
|
1541
|
+
int n_orig_ctx,
|
1542
|
+
float freq_base,
|
1543
|
+
float freq_scale,
|
1544
|
+
float ext_factor,
|
1545
|
+
float attn_factor,
|
1546
|
+
float beta_fast,
|
1547
|
+
float beta_slow),
|
1548
|
+
"use ggml_rope_ext_inplace instead");
|
1549
|
+
|
1550
|
+
// compute correction dims for YaRN RoPE scaling
|
1551
|
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
1552
|
+
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
1521
1553
|
|
1522
1554
|
// rotary position embedding backward, i.e compute dx from dy
|
1523
1555
|
// a - dy
|
@@ -1525,6 +1557,7 @@ extern "C" {
|
|
1525
1557
|
struct ggml_context * ctx,
|
1526
1558
|
struct ggml_tensor * a,
|
1527
1559
|
struct ggml_tensor * b,
|
1560
|
+
struct ggml_tensor * c,
|
1528
1561
|
int n_dims,
|
1529
1562
|
int mode,
|
1530
1563
|
int n_ctx,
|
@@ -1538,16 +1571,6 @@ extern "C" {
|
|
1538
1571
|
float xpos_base,
|
1539
1572
|
bool xpos_down);
|
1540
1573
|
|
1541
|
-
// alibi position embedding
|
1542
|
-
// in-place, returns view(a)
|
1543
|
-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
|
1544
|
-
struct ggml_context * ctx,
|
1545
|
-
struct ggml_tensor * a,
|
1546
|
-
int n_past,
|
1547
|
-
int n_head,
|
1548
|
-
float bias_max),
|
1549
|
-
"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
|
1550
|
-
|
1551
1574
|
// clamp
|
1552
1575
|
// in-place, returns view(a)
|
1553
1576
|
GGML_API struct ggml_tensor * ggml_clamp(
|
@@ -1677,12 +1700,24 @@ extern "C" {
|
|
1677
1700
|
float p1);
|
1678
1701
|
|
1679
1702
|
// nearest interpolate
|
1703
|
+
// multiplies ne0 and ne1 by scale factor
|
1680
1704
|
// used in stable-diffusion
|
1681
1705
|
GGML_API struct ggml_tensor * ggml_upscale(
|
1682
1706
|
struct ggml_context * ctx,
|
1683
1707
|
struct ggml_tensor * a,
|
1684
1708
|
int scale_factor);
|
1685
1709
|
|
1710
|
+
// nearest interpolate
|
1711
|
+
// nearest interpolate to specified dimensions
|
1712
|
+
// used in tortoise.cpp
|
1713
|
+
GGML_API struct ggml_tensor * ggml_upscale_ext(
|
1714
|
+
struct ggml_context * ctx,
|
1715
|
+
struct ggml_tensor * a,
|
1716
|
+
int ne0,
|
1717
|
+
int ne1,
|
1718
|
+
int ne2,
|
1719
|
+
int ne3);
|
1720
|
+
|
1686
1721
|
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
1687
1722
|
GGML_API struct ggml_tensor * ggml_pad(
|
1688
1723
|
struct ggml_context * ctx,
|
@@ -1724,13 +1759,6 @@ extern "C" {
|
|
1724
1759
|
struct ggml_tensor * a,
|
1725
1760
|
int k);
|
1726
1761
|
|
1727
|
-
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1728
|
-
struct ggml_context * ctx,
|
1729
|
-
struct ggml_tensor * q,
|
1730
|
-
struct ggml_tensor * k,
|
1731
|
-
struct ggml_tensor * v,
|
1732
|
-
bool masked);
|
1733
|
-
|
1734
1762
|
#define GGML_KQ_MASK_PAD 32
|
1735
1763
|
|
1736
1764
|
// q: [n_embd, n_batch, n_head, 1]
|
@@ -1744,12 +1772,14 @@ extern "C" {
|
|
1744
1772
|
struct ggml_tensor * k,
|
1745
1773
|
struct ggml_tensor * v,
|
1746
1774
|
struct ggml_tensor * mask,
|
1747
|
-
float scale);
|
1775
|
+
float scale,
|
1776
|
+
float max_bias);
|
1748
1777
|
|
1749
1778
|
GGML_API void ggml_flash_attn_ext_set_prec(
|
1750
1779
|
struct ggml_tensor * a,
|
1751
1780
|
enum ggml_prec prec);
|
1752
1781
|
|
1782
|
+
// TODO: needs to be adapted to ggml_flash_attn_ext
|
1753
1783
|
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
1754
1784
|
struct ggml_context * ctx,
|
1755
1785
|
struct ggml_tensor * q,
|
@@ -1758,14 +1788,6 @@ extern "C" {
|
|
1758
1788
|
struct ggml_tensor * d,
|
1759
1789
|
bool masked);
|
1760
1790
|
|
1761
|
-
GGML_API struct ggml_tensor * ggml_flash_ff(
|
1762
|
-
struct ggml_context * ctx,
|
1763
|
-
struct ggml_tensor * a,
|
1764
|
-
struct ggml_tensor * b0,
|
1765
|
-
struct ggml_tensor * b1,
|
1766
|
-
struct ggml_tensor * c0,
|
1767
|
-
struct ggml_tensor * c1);
|
1768
|
-
|
1769
1791
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
1770
1792
|
struct ggml_context * ctx,
|
1771
1793
|
struct ggml_tensor * s,
|
@@ -2379,6 +2401,7 @@ extern "C" {
|
|
2379
2401
|
GGML_API int ggml_cpu_has_avx512 (void);
|
2380
2402
|
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
2381
2403
|
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
2404
|
+
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
2382
2405
|
GGML_API int ggml_cpu_has_fma (void);
|
2383
2406
|
GGML_API int ggml_cpu_has_neon (void);
|
2384
2407
|
GGML_API int ggml_cpu_has_arm_fma (void);
|