llama_cpp 0.15.2 → 0.15.3
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -481,9 +481,7 @@ extern "C" {
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
-        GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_ATTN_EXT,
-        GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
         GGML_OP_SSM_SCAN,
@@ -1460,11 +1458,12 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (
+    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
+    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1483,10 +1482,11 @@ extern "C" {
             int                   n_ctx);
 
     // custom RoPE
-    GGML_API struct ggml_tensor * ggml_rope_custom(
+    GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
@@ -1499,10 +1499,11 @@ extern "C" {
             float                 beta_slow);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+    GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
@@ -1514,18 +1515,41 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
-    // compute correction dims for YaRN RoPE scaling
-    GGML_CALL void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            int                   n_orig_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow),
+        "use ggml_rope_ext instead");
 
-    // xPos RoPE, in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             int                   n_dims,
-            float                 base,
-            bool                  down);
+            int                   mode,
+            int                   n_ctx,
+            int                   n_orig_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow),
+        "use ggml_rope_ext_inplace instead");
+
+    // compute correction dims for YaRN RoPE scaling
+    GGML_CALL void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
@@ -1533,6 +1557,7 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
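The net effect of the RoPE hunks above: ggml_rope_custom and ggml_rope_custom_inplace are deprecated in favor of ggml_rope_ext and ggml_rope_ext_inplace, which take an extra tensor `c` carrying optional frequency factors (e.g. for phi3-128k), and ggml_rope_back gains the same `c` parameter. A minimal migration sketch follows, assuming NULL is the "no frequency factors" value (the header only marks `c` as optional); the tensor names and parameter values are illustrative, not from this diff:

```c
// Hypothetical migration sketch for callers of the deprecated ggml_rope_custom().
// Only the ggml_rope_ext() signature comes from the header above; everything
// else (names, n_dims, frequency settings) is an assumption for illustration.
#include "ggml.h"

static struct ggml_tensor * apply_rope(struct ggml_context * ctx,
                                       struct ggml_tensor  * cur,            // activations to rotate
                                       struct ggml_tensor  * pos,            // int32 positions, size cur->ne[2]
                                       struct ggml_tensor  * freq_factors) { // optional, may be NULL
    const int n_dims     = 128;   // rotary dimensions
    const int mode       = 0;     // 0 = original RoPE; mode & 2 selects GPT-NeoX style
    const int n_ctx      = 4096;
    const int n_orig_ctx = 4096;

    // Before 0.15.3 (now deprecated): ggml_rope_custom(ctx, cur, pos, n_dims, mode, ...)
    // From 0.15.3: same argument list with the freq-factors tensor c inserted after b.
    return ggml_rope_ext(ctx, cur, pos, freq_factors,
                         n_dims, mode, n_ctx, n_orig_ctx,
                         /*freq_base   =*/ 10000.0f,
                         /*freq_scale  =*/ 1.0f,
                         /*ext_factor  =*/ 0.0f,
                         /*attn_factor =*/ 1.0f,
                         /*beta_fast   =*/ 32.0f,
                         /*beta_slow   =*/ 1.0f);
}
```

Code built against the old header keeps compiling through the GGML_DEPRECATED shims, but backward-pass callers of ggml_rope_back need the matching extra argument.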
@@ -1734,13 +1759,6 @@ extern "C" {
             struct ggml_tensor  * a,
             int                   k);
 
-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * q,
-            struct ggml_tensor  * k,
-            struct ggml_tensor  * v,
-            bool                  masked);
-
 #define GGML_KQ_MASK_PAD 32
 
     // q: [n_embd, n_batch, n_head, 1]
@@ -1761,6 +1779,7 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_prec        prec);
 
+    // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
@@ -1769,14 +1788,6 @@ extern "C" {
             struct ggml_tensor  * d,
             bool                  masked);
 
-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b0,
-            struct ggml_tensor  * b1,
-            struct ggml_tensor  * c0,
-            struct ggml_tensor  * c1);
-
     GGML_API struct ggml_tensor * ggml_ssm_conv(
             struct ggml_context * ctx,
             struct ggml_tensor  * s,
@@ -2390,6 +2401,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_avx512     (void);
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_avx512_bf16(void);
     GGML_API int ggml_cpu_has_fma        (void);
     GGML_API int ggml_cpu_has_neon       (void);
     GGML_API int ggml_cpu_has_arm_fma    (void);
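The last hunk adds an AVX-512 BF16 capability probe alongside the existing CPU feature checks. A small sketch of how a caller might query it; only the two declarations come from the header, the surrounding program is illustrative:

```c
// Minimal sketch (not from this diff): print the AVX-512 feature flags,
// including the ggml_cpu_has_avx512_bf16() probe added in this version.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    printf("AVX-512:      %d\n", ggml_cpu_has_avx512());
    printf("AVX-512 BF16: %d\n", ggml_cpu_has_avx512_bf16()); // new in this header
    return 0;
}
```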