llama_cpp 0.15.2 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -481,9 +481,7 @@ extern "C" {
|
|
481
481
|
GGML_OP_ARGSORT,
|
482
482
|
GGML_OP_LEAKY_RELU,
|
483
483
|
|
484
|
-
GGML_OP_FLASH_ATTN,
|
485
484
|
GGML_OP_FLASH_ATTN_EXT,
|
486
|
-
GGML_OP_FLASH_FF,
|
487
485
|
GGML_OP_FLASH_ATTN_BACK,
|
488
486
|
GGML_OP_SSM_CONV,
|
489
487
|
GGML_OP_SSM_SCAN,
|
@@ -758,7 +756,6 @@ extern "C" {
|
|
758
756
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
759
757
|
|
760
758
|
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
761
|
-
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
762
759
|
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
763
760
|
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
|
764
761
|
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
@@ -767,6 +764,11 @@ extern "C" {
|
|
767
764
|
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
768
765
|
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
769
766
|
|
767
|
+
GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
768
|
+
GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
769
|
+
GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
770
|
+
GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
771
|
+
|
770
772
|
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
771
773
|
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
772
774
|
|
@@ -1009,12 +1011,13 @@ extern "C" {
|
|
1009
1011
|
struct ggml_tensor * a,
|
1010
1012
|
struct ggml_tensor * b);
|
1011
1013
|
|
1012
|
-
// concat a and b
|
1014
|
+
// concat a and b along dim
|
1013
1015
|
// used in stable-diffusion
|
1014
1016
|
GGML_API struct ggml_tensor * ggml_concat(
|
1015
1017
|
struct ggml_context * ctx,
|
1016
1018
|
struct ggml_tensor * a,
|
1017
|
-
struct ggml_tensor * b);
|
1019
|
+
struct ggml_tensor * b,
|
1020
|
+
int dim);
|
1018
1021
|
|
1019
1022
|
GGML_API struct ggml_tensor * ggml_abs(
|
1020
1023
|
struct ggml_context * ctx,
|
@@ -1460,11 +1463,12 @@ extern "C" {
|
|
1460
1463
|
struct ggml_tensor * b);
|
1461
1464
|
|
1462
1465
|
// rotary position embedding
|
1463
|
-
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
1466
|
+
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
|
1464
1467
|
// if mode & 2 == 1, GPT-NeoX style
|
1465
1468
|
// if mode & 4 == 1, ChatGLM style
|
1466
1469
|
//
|
1467
1470
|
// b is an int32 vector with size a->ne[2], it contains the positions
|
1471
|
+
// c is freq factors (e.g. phi3-128k), (optional)
|
1468
1472
|
GGML_API struct ggml_tensor * ggml_rope(
|
1469
1473
|
struct ggml_context * ctx,
|
1470
1474
|
struct ggml_tensor * a,
|
@@ -1483,10 +1487,11 @@ extern "C" {
|
|
1483
1487
|
int n_ctx);
|
1484
1488
|
|
1485
1489
|
// custom RoPE
|
1486
|
-
GGML_API struct ggml_tensor * ggml_rope_custom(
|
1490
|
+
GGML_API struct ggml_tensor * ggml_rope_ext(
|
1487
1491
|
struct ggml_context * ctx,
|
1488
1492
|
struct ggml_tensor * a,
|
1489
1493
|
struct ggml_tensor * b,
|
1494
|
+
struct ggml_tensor * c,
|
1490
1495
|
int n_dims,
|
1491
1496
|
int mode,
|
1492
1497
|
int n_ctx,
|
@@ -1499,10 +1504,11 @@ extern "C" {
|
|
1499
1504
|
float beta_slow);
|
1500
1505
|
|
1501
1506
|
// in-place, returns view(a)
|
1502
|
-
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1507
|
+
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
1503
1508
|
struct ggml_context * ctx,
|
1504
1509
|
struct ggml_tensor * a,
|
1505
1510
|
struct ggml_tensor * b,
|
1511
|
+
struct ggml_tensor * c,
|
1506
1512
|
int n_dims,
|
1507
1513
|
int mode,
|
1508
1514
|
int n_ctx,
|
@@ -1514,18 +1520,49 @@ extern "C" {
|
|
1514
1520
|
float beta_fast,
|
1515
1521
|
float beta_slow);
|
1516
1522
|
|
1517
|
-
|
1518
|
-
|
1519
|
-
|
1523
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
1524
|
+
struct ggml_context * ctx,
|
1525
|
+
struct ggml_tensor * a,
|
1526
|
+
struct ggml_tensor * b,
|
1527
|
+
int n_dims,
|
1528
|
+
int mode,
|
1529
|
+
int n_ctx,
|
1530
|
+
int n_orig_ctx,
|
1531
|
+
float freq_base,
|
1532
|
+
float freq_scale,
|
1533
|
+
float ext_factor,
|
1534
|
+
float attn_factor,
|
1535
|
+
float beta_fast,
|
1536
|
+
float beta_slow),
|
1537
|
+
"use ggml_rope_ext instead");
|
1520
1538
|
|
1521
|
-
|
1522
|
-
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1539
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1523
1540
|
struct ggml_context * ctx,
|
1524
1541
|
struct ggml_tensor * a,
|
1525
1542
|
struct ggml_tensor * b,
|
1526
1543
|
int n_dims,
|
1527
|
-
|
1528
|
-
|
1544
|
+
int mode,
|
1545
|
+
int n_ctx,
|
1546
|
+
int n_orig_ctx,
|
1547
|
+
float freq_base,
|
1548
|
+
float freq_scale,
|
1549
|
+
float ext_factor,
|
1550
|
+
float attn_factor,
|
1551
|
+
float beta_fast,
|
1552
|
+
float beta_slow),
|
1553
|
+
"use ggml_rope_ext_inplace instead");
|
1554
|
+
|
1555
|
+
struct ggml_tensor * ggml_rope_xpos_inplace(
|
1556
|
+
struct ggml_context * ctx,
|
1557
|
+
struct ggml_tensor * a,
|
1558
|
+
struct ggml_tensor * b,
|
1559
|
+
int n_dims,
|
1560
|
+
float base,
|
1561
|
+
bool down);
|
1562
|
+
|
1563
|
+
// compute correction dims for YaRN RoPE scaling
|
1564
|
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
1565
|
+
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
1529
1566
|
|
1530
1567
|
// rotary position embedding backward, i.e compute dx from dy
|
1531
1568
|
// a - dy
|
@@ -1533,6 +1570,7 @@ extern "C" {
|
|
1533
1570
|
struct ggml_context * ctx,
|
1534
1571
|
struct ggml_tensor * a,
|
1535
1572
|
struct ggml_tensor * b,
|
1573
|
+
struct ggml_tensor * c,
|
1536
1574
|
int n_dims,
|
1537
1575
|
int mode,
|
1538
1576
|
int n_ctx,
|
@@ -1734,13 +1772,6 @@ extern "C" {
|
|
1734
1772
|
struct ggml_tensor * a,
|
1735
1773
|
int k);
|
1736
1774
|
|
1737
|
-
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1738
|
-
struct ggml_context * ctx,
|
1739
|
-
struct ggml_tensor * q,
|
1740
|
-
struct ggml_tensor * k,
|
1741
|
-
struct ggml_tensor * v,
|
1742
|
-
bool masked);
|
1743
|
-
|
1744
1775
|
#define GGML_KQ_MASK_PAD 32
|
1745
1776
|
|
1746
1777
|
// q: [n_embd, n_batch, n_head, 1]
|
@@ -1761,6 +1792,7 @@ extern "C" {
|
|
1761
1792
|
struct ggml_tensor * a,
|
1762
1793
|
enum ggml_prec prec);
|
1763
1794
|
|
1795
|
+
// TODO: needs to be adapted to ggml_flash_attn_ext
|
1764
1796
|
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
1765
1797
|
struct ggml_context * ctx,
|
1766
1798
|
struct ggml_tensor * q,
|
@@ -1769,14 +1801,6 @@ extern "C" {
|
|
1769
1801
|
struct ggml_tensor * d,
|
1770
1802
|
bool masked);
|
1771
1803
|
|
1772
|
-
GGML_API struct ggml_tensor * ggml_flash_ff(
|
1773
|
-
struct ggml_context * ctx,
|
1774
|
-
struct ggml_tensor * a,
|
1775
|
-
struct ggml_tensor * b0,
|
1776
|
-
struct ggml_tensor * b1,
|
1777
|
-
struct ggml_tensor * c0,
|
1778
|
-
struct ggml_tensor * c1);
|
1779
|
-
|
1780
1804
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
1781
1805
|
struct ggml_context * ctx,
|
1782
1806
|
struct ggml_tensor * s,
|
@@ -2390,8 +2414,10 @@ extern "C" {
|
|
2390
2414
|
GGML_API int ggml_cpu_has_avx512 (void);
|
2391
2415
|
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
2392
2416
|
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
2417
|
+
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
2393
2418
|
GGML_API int ggml_cpu_has_fma (void);
|
2394
2419
|
GGML_API int ggml_cpu_has_neon (void);
|
2420
|
+
GGML_API int ggml_cpu_has_sve (void);
|
2395
2421
|
GGML_API int ggml_cpu_has_arm_fma (void);
|
2396
2422
|
GGML_API int ggml_cpu_has_metal (void);
|
2397
2423
|
GGML_API int ggml_cpu_has_f16c (void);
|
@@ -2406,6 +2432,7 @@ extern "C" {
|
|
2406
2432
|
GGML_API int ggml_cpu_has_sse3 (void);
|
2407
2433
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
2408
2434
|
GGML_API int ggml_cpu_has_sycl (void);
|
2435
|
+
GGML_API int ggml_cpu_has_rpc (void);
|
2409
2436
|
GGML_API int ggml_cpu_has_vsx (void);
|
2410
2437
|
GGML_API int ggml_cpu_has_matmul_int8(void);
|
2411
2438
|
|