llama_cpp 0.15.2 → 0.15.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -481,9 +481,7 @@ extern "C" {
|
|
481
481
|
GGML_OP_ARGSORT,
|
482
482
|
GGML_OP_LEAKY_RELU,
|
483
483
|
|
484
|
-
GGML_OP_FLASH_ATTN,
|
485
484
|
GGML_OP_FLASH_ATTN_EXT,
|
486
|
-
GGML_OP_FLASH_FF,
|
487
485
|
GGML_OP_FLASH_ATTN_BACK,
|
488
486
|
GGML_OP_SSM_CONV,
|
489
487
|
GGML_OP_SSM_SCAN,
|
@@ -1460,11 +1458,12 @@ extern "C" {
|
|
1460
1458
|
struct ggml_tensor * b);
|
1461
1459
|
|
1462
1460
|
// rotary position embedding
|
1463
|
-
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
1461
|
+
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
|
1464
1462
|
// if mode & 2 == 1, GPT-NeoX style
|
1465
1463
|
// if mode & 4 == 1, ChatGLM style
|
1466
1464
|
//
|
1467
1465
|
// b is an int32 vector with size a->ne[2], it contains the positions
|
1466
|
+
// c is freq factors (e.g. phi3-128k), (optional)
|
1468
1467
|
GGML_API struct ggml_tensor * ggml_rope(
|
1469
1468
|
struct ggml_context * ctx,
|
1470
1469
|
struct ggml_tensor * a,
|
@@ -1483,10 +1482,11 @@ extern "C" {
|
|
1483
1482
|
int n_ctx);
|
1484
1483
|
|
1485
1484
|
// custom RoPE
|
1486
|
-
GGML_API struct ggml_tensor *
|
1485
|
+
GGML_API struct ggml_tensor * ggml_rope_ext(
|
1487
1486
|
struct ggml_context * ctx,
|
1488
1487
|
struct ggml_tensor * a,
|
1489
1488
|
struct ggml_tensor * b,
|
1489
|
+
struct ggml_tensor * c,
|
1490
1490
|
int n_dims,
|
1491
1491
|
int mode,
|
1492
1492
|
int n_ctx,
|
@@ -1499,10 +1499,11 @@ extern "C" {
|
|
1499
1499
|
float beta_slow);
|
1500
1500
|
|
1501
1501
|
// in-place, returns view(a)
|
1502
|
-
GGML_API struct ggml_tensor *
|
1502
|
+
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
1503
1503
|
struct ggml_context * ctx,
|
1504
1504
|
struct ggml_tensor * a,
|
1505
1505
|
struct ggml_tensor * b,
|
1506
|
+
struct ggml_tensor * c,
|
1506
1507
|
int n_dims,
|
1507
1508
|
int mode,
|
1508
1509
|
int n_ctx,
|
@@ -1514,18 +1515,41 @@ extern "C" {
|
|
1514
1515
|
float beta_fast,
|
1515
1516
|
float beta_slow);
|
1516
1517
|
|
1517
|
-
|
1518
|
-
|
1519
|
-
|
1518
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
1519
|
+
struct ggml_context * ctx,
|
1520
|
+
struct ggml_tensor * a,
|
1521
|
+
struct ggml_tensor * b,
|
1522
|
+
int n_dims,
|
1523
|
+
int mode,
|
1524
|
+
int n_ctx,
|
1525
|
+
int n_orig_ctx,
|
1526
|
+
float freq_base,
|
1527
|
+
float freq_scale,
|
1528
|
+
float ext_factor,
|
1529
|
+
float attn_factor,
|
1530
|
+
float beta_fast,
|
1531
|
+
float beta_slow),
|
1532
|
+
"use ggml_rope_ext instead");
|
1520
1533
|
|
1521
|
-
|
1522
|
-
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1534
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1523
1535
|
struct ggml_context * ctx,
|
1524
1536
|
struct ggml_tensor * a,
|
1525
1537
|
struct ggml_tensor * b,
|
1526
1538
|
int n_dims,
|
1527
|
-
|
1528
|
-
|
1539
|
+
int mode,
|
1540
|
+
int n_ctx,
|
1541
|
+
int n_orig_ctx,
|
1542
|
+
float freq_base,
|
1543
|
+
float freq_scale,
|
1544
|
+
float ext_factor,
|
1545
|
+
float attn_factor,
|
1546
|
+
float beta_fast,
|
1547
|
+
float beta_slow),
|
1548
|
+
"use ggml_rope_ext_inplace instead");
|
1549
|
+
|
1550
|
+
// compute correction dims for YaRN RoPE scaling
|
1551
|
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
1552
|
+
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
1529
1553
|
|
1530
1554
|
// rotary position embedding backward, i.e compute dx from dy
|
1531
1555
|
// a - dy
|
@@ -1533,6 +1557,7 @@ extern "C" {
|
|
1533
1557
|
struct ggml_context * ctx,
|
1534
1558
|
struct ggml_tensor * a,
|
1535
1559
|
struct ggml_tensor * b,
|
1560
|
+
struct ggml_tensor * c,
|
1536
1561
|
int n_dims,
|
1537
1562
|
int mode,
|
1538
1563
|
int n_ctx,
|
@@ -1734,13 +1759,6 @@ extern "C" {
|
|
1734
1759
|
struct ggml_tensor * a,
|
1735
1760
|
int k);
|
1736
1761
|
|
1737
|
-
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1738
|
-
struct ggml_context * ctx,
|
1739
|
-
struct ggml_tensor * q,
|
1740
|
-
struct ggml_tensor * k,
|
1741
|
-
struct ggml_tensor * v,
|
1742
|
-
bool masked);
|
1743
|
-
|
1744
1762
|
#define GGML_KQ_MASK_PAD 32
|
1745
1763
|
|
1746
1764
|
// q: [n_embd, n_batch, n_head, 1]
|
@@ -1761,6 +1779,7 @@ extern "C" {
|
|
1761
1779
|
struct ggml_tensor * a,
|
1762
1780
|
enum ggml_prec prec);
|
1763
1781
|
|
1782
|
+
// TODO: needs to be adapted to ggml_flash_attn_ext
|
1764
1783
|
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
1765
1784
|
struct ggml_context * ctx,
|
1766
1785
|
struct ggml_tensor * q,
|
@@ -1769,14 +1788,6 @@ extern "C" {
|
|
1769
1788
|
struct ggml_tensor * d,
|
1770
1789
|
bool masked);
|
1771
1790
|
|
1772
|
-
GGML_API struct ggml_tensor * ggml_flash_ff(
|
1773
|
-
struct ggml_context * ctx,
|
1774
|
-
struct ggml_tensor * a,
|
1775
|
-
struct ggml_tensor * b0,
|
1776
|
-
struct ggml_tensor * b1,
|
1777
|
-
struct ggml_tensor * c0,
|
1778
|
-
struct ggml_tensor * c1);
|
1779
|
-
|
1780
1791
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
1781
1792
|
struct ggml_context * ctx,
|
1782
1793
|
struct ggml_tensor * s,
|
@@ -2390,6 +2401,7 @@ extern "C" {
|
|
2390
2401
|
GGML_API int ggml_cpu_has_avx512 (void);
|
2391
2402
|
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
2392
2403
|
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
2404
|
+
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
2393
2405
|
GGML_API int ggml_cpu_has_fma (void);
|
2394
2406
|
GGML_API int ggml_cpu_has_neon (void);
|
2395
2407
|
GGML_API int ggml_cpu_has_arm_fma (void);
|