llama_cpp 0.15.2 → 0.15.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -481,9 +481,7 @@ extern "C" {
|
|
481
481
|
GGML_OP_ARGSORT,
|
482
482
|
GGML_OP_LEAKY_RELU,
|
483
483
|
|
484
|
-
GGML_OP_FLASH_ATTN,
|
485
484
|
GGML_OP_FLASH_ATTN_EXT,
|
486
|
-
GGML_OP_FLASH_FF,
|
487
485
|
GGML_OP_FLASH_ATTN_BACK,
|
488
486
|
GGML_OP_SSM_CONV,
|
489
487
|
GGML_OP_SSM_SCAN,
|
@@ -758,7 +756,6 @@ extern "C" {
|
|
758
756
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
759
757
|
|
760
758
|
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
761
|
-
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
762
759
|
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
763
760
|
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
|
764
761
|
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
@@ -767,6 +764,11 @@ extern "C" {
|
|
767
764
|
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
768
765
|
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
769
766
|
|
767
|
+
GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
768
|
+
GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
769
|
+
GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
770
|
+
GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
771
|
+
|
770
772
|
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
771
773
|
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
772
774
|
|
@@ -1009,12 +1011,13 @@ extern "C" {
|
|
1009
1011
|
struct ggml_tensor * a,
|
1010
1012
|
struct ggml_tensor * b);
|
1011
1013
|
|
1012
|
-
// concat a and b
|
1014
|
+
// concat a and b along dim
|
1013
1015
|
// used in stable-diffusion
|
1014
1016
|
GGML_API struct ggml_tensor * ggml_concat(
|
1015
1017
|
struct ggml_context * ctx,
|
1016
1018
|
struct ggml_tensor * a,
|
1017
|
-
struct ggml_tensor * b
|
1019
|
+
struct ggml_tensor * b,
|
1020
|
+
int dim);
|
1018
1021
|
|
1019
1022
|
GGML_API struct ggml_tensor * ggml_abs(
|
1020
1023
|
struct ggml_context * ctx,
|
@@ -1460,11 +1463,12 @@ extern "C" {
|
|
1460
1463
|
struct ggml_tensor * b);
|
1461
1464
|
|
1462
1465
|
// rotary position embedding
|
1463
|
-
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
1466
|
+
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
|
1464
1467
|
// if mode & 2 == 1, GPT-NeoX style
|
1465
1468
|
// if mode & 4 == 1, ChatGLM style
|
1466
1469
|
//
|
1467
1470
|
// b is an int32 vector with size a->ne[2], it contains the positions
|
1471
|
+
// c is freq factors (e.g. phi3-128k), (optional)
|
1468
1472
|
GGML_API struct ggml_tensor * ggml_rope(
|
1469
1473
|
struct ggml_context * ctx,
|
1470
1474
|
struct ggml_tensor * a,
|
@@ -1483,10 +1487,11 @@ extern "C" {
|
|
1483
1487
|
int n_ctx);
|
1484
1488
|
|
1485
1489
|
// custom RoPE
|
1486
|
-
GGML_API struct ggml_tensor * ggml_rope_custom(
|
1490
|
+
GGML_API struct ggml_tensor * ggml_rope_ext(
|
1487
1491
|
struct ggml_context * ctx,
|
1488
1492
|
struct ggml_tensor * a,
|
1489
1493
|
struct ggml_tensor * b,
|
1494
|
+
struct ggml_tensor * c,
|
1490
1495
|
int n_dims,
|
1491
1496
|
int mode,
|
1492
1497
|
int n_ctx,
|
@@ -1499,10 +1504,11 @@ extern "C" {
|
|
1499
1504
|
float beta_slow);
|
1500
1505
|
|
1501
1506
|
// in-place, returns view(a)
|
1502
|
-
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1507
|
+
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
1503
1508
|
struct ggml_context * ctx,
|
1504
1509
|
struct ggml_tensor * a,
|
1505
1510
|
struct ggml_tensor * b,
|
1511
|
+
struct ggml_tensor * c,
|
1506
1512
|
int n_dims,
|
1507
1513
|
int mode,
|
1508
1514
|
int n_ctx,
|
@@ -1514,18 +1520,49 @@ extern "C" {
|
|
1514
1520
|
float beta_fast,
|
1515
1521
|
float beta_slow);
|
1516
1522
|
|
1517
|
-
|
1518
|
-
|
1519
|
-
|
1523
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
1524
|
+
struct ggml_context * ctx,
|
1525
|
+
struct ggml_tensor * a,
|
1526
|
+
struct ggml_tensor * b,
|
1527
|
+
int n_dims,
|
1528
|
+
int mode,
|
1529
|
+
int n_ctx,
|
1530
|
+
int n_orig_ctx,
|
1531
|
+
float freq_base,
|
1532
|
+
float freq_scale,
|
1533
|
+
float ext_factor,
|
1534
|
+
float attn_factor,
|
1535
|
+
float beta_fast,
|
1536
|
+
float beta_slow),
|
1537
|
+
"use ggml_rope_ext instead");
|
1520
1538
|
|
1521
|
-
|
1522
|
-
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1539
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1523
1540
|
struct ggml_context * ctx,
|
1524
1541
|
struct ggml_tensor * a,
|
1525
1542
|
struct ggml_tensor * b,
|
1526
1543
|
int n_dims,
|
1527
|
-
|
1528
|
-
|
1544
|
+
int mode,
|
1545
|
+
int n_ctx,
|
1546
|
+
int n_orig_ctx,
|
1547
|
+
float freq_base,
|
1548
|
+
float freq_scale,
|
1549
|
+
float ext_factor,
|
1550
|
+
float attn_factor,
|
1551
|
+
float beta_fast,
|
1552
|
+
float beta_slow),
|
1553
|
+
"use ggml_rope_ext_inplace instead");
|
1554
|
+
|
1555
|
+
struct ggml_tensor * ggml_rope_xpos_inplace(
|
1556
|
+
struct ggml_context * ctx,
|
1557
|
+
struct ggml_tensor * a,
|
1558
|
+
struct ggml_tensor * b,
|
1559
|
+
int n_dims,
|
1560
|
+
float base,
|
1561
|
+
bool down);
|
1562
|
+
|
1563
|
+
// compute correction dims for YaRN RoPE scaling
|
1564
|
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
1565
|
+
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
1529
1566
|
|
1530
1567
|
// rotary position embedding backward, i.e compute dx from dy
|
1531
1568
|
// a - dy
|
@@ -1533,6 +1570,7 @@ extern "C" {
|
|
1533
1570
|
struct ggml_context * ctx,
|
1534
1571
|
struct ggml_tensor * a,
|
1535
1572
|
struct ggml_tensor * b,
|
1573
|
+
struct ggml_tensor * c,
|
1536
1574
|
int n_dims,
|
1537
1575
|
int mode,
|
1538
1576
|
int n_ctx,
|
@@ -1734,13 +1772,6 @@ extern "C" {
|
|
1734
1772
|
struct ggml_tensor * a,
|
1735
1773
|
int k);
|
1736
1774
|
|
1737
|
-
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1738
|
-
struct ggml_context * ctx,
|
1739
|
-
struct ggml_tensor * q,
|
1740
|
-
struct ggml_tensor * k,
|
1741
|
-
struct ggml_tensor * v,
|
1742
|
-
bool masked);
|
1743
|
-
|
1744
1775
|
#define GGML_KQ_MASK_PAD 32
|
1745
1776
|
|
1746
1777
|
// q: [n_embd, n_batch, n_head, 1]
|
@@ -1761,6 +1792,7 @@ extern "C" {
|
|
1761
1792
|
struct ggml_tensor * a,
|
1762
1793
|
enum ggml_prec prec);
|
1763
1794
|
|
1795
|
+
// TODO: needs to be adapted to ggml_flash_attn_ext
|
1764
1796
|
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
1765
1797
|
struct ggml_context * ctx,
|
1766
1798
|
struct ggml_tensor * q,
|
@@ -1769,14 +1801,6 @@ extern "C" {
|
|
1769
1801
|
struct ggml_tensor * d,
|
1770
1802
|
bool masked);
|
1771
1803
|
|
1772
|
-
GGML_API struct ggml_tensor * ggml_flash_ff(
|
1773
|
-
struct ggml_context * ctx,
|
1774
|
-
struct ggml_tensor * a,
|
1775
|
-
struct ggml_tensor * b0,
|
1776
|
-
struct ggml_tensor * b1,
|
1777
|
-
struct ggml_tensor * c0,
|
1778
|
-
struct ggml_tensor * c1);
|
1779
|
-
|
1780
1804
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
1781
1805
|
struct ggml_context * ctx,
|
1782
1806
|
struct ggml_tensor * s,
|
@@ -2390,8 +2414,10 @@ extern "C" {
|
|
2390
2414
|
GGML_API int ggml_cpu_has_avx512 (void);
|
2391
2415
|
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
2392
2416
|
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
2417
|
+
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
2393
2418
|
GGML_API int ggml_cpu_has_fma (void);
|
2394
2419
|
GGML_API int ggml_cpu_has_neon (void);
|
2420
|
+
GGML_API int ggml_cpu_has_sve (void);
|
2395
2421
|
GGML_API int ggml_cpu_has_arm_fma (void);
|
2396
2422
|
GGML_API int ggml_cpu_has_metal (void);
|
2397
2423
|
GGML_API int ggml_cpu_has_f16c (void);
|
@@ -2406,6 +2432,7 @@ extern "C" {
|
|
2406
2432
|
GGML_API int ggml_cpu_has_sse3 (void);
|
2407
2433
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
2408
2434
|
GGML_API int ggml_cpu_has_sycl (void);
|
2435
|
+
GGML_API int ggml_cpu_has_rpc (void);
|
2409
2436
|
GGML_API int ggml_cpu_has_vsx (void);
|
2410
2437
|
GGML_API int ggml_cpu_has_matmul_int8(void);
|
2411
2438
|
|