llama_cpp 0.9.3 → 0.9.5
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +177 -98
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +154 -30
- data/ext/llama_cpp/src/ggml.h +11 -3
- data/ext/llama_cpp/src/llama.cpp +316 -122
- data/ext/llama_cpp/src/llama.h +72 -4
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -46,7 +46,6 @@
 #endif
 #include <windows.h>
 #include <io.h>
-#include <stdio.h> // for _fseeki64
 #endif

 #include <algorithm>
@@ -91,7 +90,7 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif

-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES 8192

 //
 // logging
@@ -604,6 +603,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
     return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }

+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+        case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+        case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+        case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+        case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+        case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+        default: return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
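Note: `gguf_data_to_str` and `gguf_kv_to_str` back the new metadata logging below and the `llama_model_meta_*` API added near the end of this file. A minimal standalone sketch of the quote-escaping applied to string array elements (this `replace_all` is a stand-in for the helper llama.cpp already defines, not its exact implementation):

```cpp
#include <iostream>
#include <string>

// Stand-in for llama.cpp's replace_all helper: substitute every
// occurrence of `from` with `to`, skipping over the inserted text.
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string val = "say \"hi\\bye\"";
    replace_all(val, "\\", "\\\\"); // escape backslashes first ...
    replace_all(val, "\"", "\\\""); // ... then quotes, as in the hunk above
    std::cout << '"' << val << '"' << '\n'; // prints: "say \"hi\\bye\""
}
```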
@@ -1059,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 //

 struct llama_state {
+    llama_state() {
+#ifdef GGML_USE_METAL
+        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+#endif
+    }
+
     // We save the log callback globally
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
@@ -1221,6 +1280,7 @@ struct llama_kv_cache {
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)

     // computed before each graph build
     uint32_t n = 0;
@@ -1322,6 +1382,9 @@ struct llama_model {

     int n_gpu_layers;

+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     // context
     struct ggml_context * ctx = NULL;

@@ -1442,6 +1505,7 @@ static bool llama_kv_cache_init(

     cache.head = 0;
     cache.size = n_ctx;
+    cache.used = 0;

     cache.cells.clear();
     cache.cells.resize(n_ctx);
@@ -1543,6 +1607,8 @@ static bool llama_kv_cache_find_slot(
         }
     }

+    cache.used += n_tokens;
+
     return true;
 }

@@ -1563,6 +1629,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
         cache.cells[i].seq_id.clear();
     }
     cache.head = 0;
+    cache.used = 0;
 }

 static void llama_kv_cache_seq_rm(
@@ -1585,6 +1652,9 @@ static void llama_kv_cache_seq_rm(
                 continue;
             }
             if (cache.cells[i].seq_id.empty()) {
+                // keep count of the number of used cells
+                if (cache.cells[i].pos >= 0) cache.used--;
+
                 cache.cells[i].pos = -1;
                 if (new_head == cache.size) new_head = i;
             }
@@ -1592,7 +1662,7 @@
     }

     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
|
@@ -1618,6 +1688,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
|
|
1618
1688
|
|
1619
1689
|
for (uint32_t i = 0; i < cache.size; ++i) {
|
1620
1690
|
if (!cache.cells[i].has_seq_id(seq_id)) {
|
1691
|
+
if (cache.cells[i].pos >= 0) cache.used--;
|
1621
1692
|
cache.cells[i].pos = -1;
|
1622
1693
|
cache.cells[i].seq_id.clear();
|
1623
1694
|
if (new_head == cache.size) new_head = i;
|
@@ -1628,7 +1699,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
|
|
1628
1699
|
}
|
1629
1700
|
|
1630
1701
|
// If we freed up a slot, set head to it so searching can start there.
|
1631
|
-
if (new_head != cache.size) cache.head = new_head;
|
1702
|
+
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
|
1632
1703
|
}
|
1633
1704
|
|
1634
1705
|
static void llama_kv_cache_seq_shift(
|
@@ -1649,6 +1720,7 @@ static void llama_kv_cache_seq_shift(
|
|
1649
1720
|
cache.cells[i].delta += delta;
|
1650
1721
|
|
1651
1722
|
if (cache.cells[i].pos < 0) {
|
1723
|
+
if (!cache.cells[i].seq_id.empty()) cache.used--;
|
1652
1724
|
cache.cells[i].pos = -1;
|
1653
1725
|
cache.cells[i].seq_id.clear();
|
1654
1726
|
if (new_head == cache.size) new_head = i;
|
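Note: the KV-cache hunks above all maintain one invariant: `used` equals the number of cells holding at least one sequence id. It is incremented in `llama_kv_cache_find_slot`, decremented whenever a cell loses its last sequence, and reset in init/clear. A simplified standalone model of that bookkeeping (hypothetical types, not the library's API):

```cpp
#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Hypothetical miniature of llama_kv_cache: `used` counts cells
// that currently hold at least one sequence id.
struct mini_kv_cache {
    struct cell { int pos = -1; std::set<int> seq_id; };
    std::vector<cell> cells;
    uint32_t used = 0;

    void place(uint32_t i, int pos, int seq) {
        if (cells[i].seq_id.empty()) used++; // free -> used transition
        cells[i].pos = pos;
        cells[i].seq_id.insert(seq);
    }
    void remove_seq(uint32_t i, int seq) {
        cells[i].seq_id.erase(seq);
        if (cells[i].seq_id.empty() && cells[i].pos >= 0) {
            used--;                          // used -> free transition
            cells[i].pos = -1;
        }
    }
};

int main() {
    mini_kv_cache kv;
    kv.cells.resize(4);
    kv.place(0, /*pos*/ 0, /*seq*/ 0);
    kv.place(1, /*pos*/ 1, /*seq*/ 0);
    kv.remove_seq(0, 0);
    assert(kv.used == 1); // only cell 1 is still used
}
```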
@@ -1780,10 +1852,10 @@ struct llama_model_loader {
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
             default:
-
-
-
-
+                {
+                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                    ftype = LLAMA_FTYPE_ALL_F32;
+                } break;
         }

         // this is a way to mark that we have "guessed" the file type
|
@@ -1797,10 +1869,21 @@ struct llama_model_loader {
|
|
1797
1869
|
}
|
1798
1870
|
|
1799
1871
|
for (int i = 0; i < n_kv; i++) {
|
1800
|
-
const char * name
|
1801
|
-
const enum gguf_type type
|
1872
|
+
const char * name = gguf_get_key(ctx_gguf, i);
|
1873
|
+
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
1874
|
+
const std::string type_name =
|
1875
|
+
type == GGUF_TYPE_ARRAY
|
1876
|
+
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
|
1877
|
+
: gguf_type_name(type);
|
1878
|
+
|
1879
|
+
std::string value = gguf_kv_to_str(ctx_gguf, i);
|
1880
|
+
const size_t MAX_VALUE_LEN = 40;
|
1881
|
+
if (value.size() > MAX_VALUE_LEN) {
|
1882
|
+
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
1883
|
+
}
|
1884
|
+
replace_all(value, "\n", "\\n");
|
1802
1885
|
|
1803
|
-
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-
|
1886
|
+
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
1804
1887
|
}
|
1805
1888
|
|
1806
1889
|
// print type counts
|
@@ -2095,6 +2178,17 @@ static void llm_load_hparams(

     auto & hparams = model.hparams;

+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

@@ -2545,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }

     // general kv
-    LLAMA_LOG_INFO("%s: general.name
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token
-    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }

 static void llm_load_tensors(
static void llm_load_tensors(
|
@@ -3375,7 +3469,7 @@ static void llm_build_k_shift(
|
|
3375
3469
|
struct ggml_cgraph * graph,
|
3376
3470
|
llm_rope_type type,
|
3377
3471
|
int64_t n_ctx,
|
3378
|
-
|
3472
|
+
int n_rot,
|
3379
3473
|
float freq_base,
|
3380
3474
|
float freq_scale,
|
3381
3475
|
const llm_build_cb & cb) {
|
@@ -3407,7 +3501,7 @@ static void llm_build_k_shift(
|
|
3407
3501
|
// we rotate only the first n_rot dimensions
|
3408
3502
|
ggml_rope_custom_inplace(ctx,
|
3409
3503
|
ggml_view_3d(ctx, kv.k,
|
3410
|
-
|
3504
|
+
n_embd_head, n_head_kv, n_ctx,
|
3411
3505
|
ggml_element_size(kv.k)*n_embd_head,
|
3412
3506
|
ggml_element_size(kv.k)*n_embd_gqa,
|
3413
3507
|
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
|
@@ -3605,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);

-    kq = ggml_scale(ctx, kq, kq_scale);
-    cb(kq, "kq_scaled", il);
-
     if (max_alibi_bias > 0.0f) {
-        //
-
-
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
-    }
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+        kq = ggml_scale(ctx, kq, kq_scale);
+        cb(kq, "kq_scaled", il);

-
-
+        if (max_alibi_bias > 0.0f) {
+            // TODO: n_head or n_head_kv
+            // TODO: K-shift is likely not working
+            // TODO: change to ggml_add
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
+        }

-
-
+        kq = ggml_add(ctx, kq, kq_mask);
+        cb(kq, "kq_masked", il);
+
+        kq = ggml_soft_max(ctx, kq);
+        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
+    }

     // split cached v into n_head heads
     struct ggml_tensor * v =
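Note: when no ALiBi bias is required, the former scale -> add-mask -> softmax chain collapses into the single fused op `ggml_soft_max_ext(ctx, kq, kq_mask, scale)`. A scalar reference for the math it computes per attention row, softmax(kq*scale + mask) (just the arithmetic, not the ggml kernel):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Reference: softmax(kq*scale + mask) in one pass over the row.
std::vector<float> soft_max_ext_ref(std::vector<float> kq, const std::vector<float> & mask, float scale) {
    float max_val = -INFINITY;
    for (size_t i = 0; i < kq.size(); i++) {
        kq[i] = kq[i]*scale + mask[i];       // fused scale + mask
        max_val = std::fmax(max_val, kq[i]);
    }
    float sum = 0.0f;
    for (float & v : kq) { v = std::exp(v - max_val); sum += v; }
    for (float & v : kq) { v /= sum; }
    return kq;
}

int main() {
    // -INFINITY in the mask hides future positions, as kq_mask does;
    // 0.125f plays the role of 1/sqrt(n_embd_head) for a 64-dim head.
    auto p = soft_max_ext_ref({1.0f, 2.0f, 3.0f}, {0.0f, 0.0f, -INFINITY}, 0.125f);
    printf("%.3f %.3f %.3f\n", p[0], p[1], p[2]); // third weight is exactly 0
}
```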
@@ -4730,92 +4830,34 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor *
-                cb(
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);

-                struct ggml_tensor *
-                cb(
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);

                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

-
-
-
-
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        0
-                ));
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
-                        ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head_kv,
-                        0
-                ));
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                        ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        ggml_element_size(tmpq) * hparams.n_rot
-                );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                        ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
-                        ggml_element_size(tmpk) * (n_embd_head),
-                        ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
-                        ggml_element_size(tmpk) * hparams.n_rot
-                );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                        ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                        ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
                 );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
                 cb(Qcur, "Qcur", il);

-
-
-
-
-
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Kcur, "Kcur", il);

                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

                 cur = llm_build_kqv(ctx0, hparams, kv_self,
                         model.layers[il].wo, NULL,
-
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }

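Note: the deleted code split each head into a rotated slice (`qrot`/`krot`, the first `n_rot` dims) and a pass-through slice (`qpass`/`kpass`), roped the slices, then permuted and re-concatenated them. `ggml_rope_custom` applied to the reshaped tensor already rotates only the first `n_rot` dimensions and leaves the rest untouched, so the whole dance is unnecessary. A scalar sketch of that pass-through property (one common pairing convention; ggml's NeoX mode pairs dimensions differently, but the untouched tail is the same):

```cpp
#include <cmath>
#include <vector>

// Rotate only the first n_rot dims of one head vector; dims >= n_rot
// pass through unchanged -- no explicit split/concat needed.
void rope_partial(std::vector<float> & head, int n_rot, int pos, float freq_base = 10000.0f) {
    for (int i = 0; i + 1 < n_rot; i += 2) {
        const float theta = pos * std::pow(freq_base, -float(i)/n_rot);
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        const float x0 = head[i];
        const float x1 = head[i + 1];
        head[i]     = x0*c - x1*s;
        head[i + 1] = x0*s + x1*c;
    }
    // head[n_rot..] is intentionally left as-is
}
```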
@@ -5000,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
     { "kq_masked", OFFLOAD_FUNC_KQ },
     { "kq_soft_max", OFFLOAD_FUNC_V },
+    { "kq_soft_max_ext", OFFLOAD_FUNC_V },
     { "v", OFFLOAD_FUNC_V },
     { "kqv", OFFLOAD_FUNC_V },
     { "kqv_merged", OFFLOAD_FUNC_V },
@@ -5443,6 +5486,12 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }

+    // if we have enough unused cells before the current head ->
+    //   better to start searching from the beginning of the cache, hoping to fill it
+    if (kv_self.head > kv_self.used + 2*n_tokens) {
+        kv_self.head = 0;
+    }
+
    if (!llama_kv_cache_find_slot(kv_self, batch)) {
        return 1;
    }
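Note: at most `used` of the cells below `head` can be occupied, so `head > used + 2*n_tokens` guarantees well over `2*n_tokens` free cells before the current search position, and rewinding `head` lets `llama_kv_cache_find_slot` refill those holes. A numeric illustration (values are hypothetical):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t head = 900, used = 300, n_tokens = 64;
    // 900 > 300 + 128: even if every used cell sat below head,
    // at least 600 of the 900 cells below it would be free,
    // so restart the slot search from the beginning.
    if (head > used + 2*n_tokens) {
        head = 0;
    }
    printf("head = %u\n", head); // head = 0
}
```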
@@ -5453,7 +5502,7 @@ static int llama_decode_internal(
     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

-    //printf("kv_self.n = %
+    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);


     ggml_allocr_reset(lctx.alloc);
@@ -5502,18 +5551,8 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }

-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT ||
-        model.arch == LLM_ARCH_MPT ||
-        model.arch == LLM_ARCH_STARCODER ||
-        model.arch == LLM_ARCH_STABLELM;
-
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() &&
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }

@@ -6372,10 +6411,13 @@ struct llama_grammar_candidate {
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
+        size_t n_src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(n_src + 1);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;

@@ -6426,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }

+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        std::string src,
+        llama_partial_utf8 partial_start
+) {
+    return decode_utf8(src.c_str(), src.size(), partial_start);
+}
+
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
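Note: the new overload simply forwards a `std::string` to the pointer/length version so the grammar code can pass `piece` directly, and the added `reserve` sizes the output for the common ASCII case. The `lookup` table maps the high nibble of a sequence's first byte to its length in bytes. A self-contained sketch of that lookup (not the full decoder):

```cpp
#include <cstdio>

int main() {
    // High-nibble -> sequence length: 0xxxxxxx = 1 byte, 110xxxxx = 2,
    // 1110xxxx = 3, 11110xxx = 4; 10xxxxxx (continuation bytes) map to 0.
    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };

    const char * s = "a\xC3\xA9"; // "aé" encoded as UTF-8
    for (const char * p = s; *p; ) {
        const int len = lookup[(unsigned char)(*p) >> 4];
        printf("first byte 0x%02X -> %d-byte sequence\n", (unsigned char)(*p), len);
        p += len > 0 ? len : 1;   // the real decoder reports len == 0 as invalid
    }
}
```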
@@ -6979,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     // Replace the data in candidates with the new_candidates data
     std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
     candidates->size = new_candidates.size();
+    candidates->sorted = false;

     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7075,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7282,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     const std::string piece = llama_token_to_piece(ctx, token);

     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(piece
+    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -8527,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(

 #ifdef GGML_USE_METAL
     if (model->n_gpu_layers > 0) {
-        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
         ctx->ctx_metal = ggml_metal_init(1);
         if (!ctx->ctx_metal) {
             LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -8666,6 +8714,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }

+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),
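Note: these four functions are the new public metadata API (their declarations land in `llama.h`, which this release also updates). All of them write into a caller-provided buffer and return a `snprintf`-style length, or -1 when the key/index is missing. A usage sketch (assumes `model` came from `llama_load_model_from_file`; buffer sizes are arbitrary):

```cpp
#include "llama.h"
#include <cstdio>

void dump_metadata(const struct llama_model * model) {
    char key[256];
    char val[256];

    const int n = llama_model_meta_count(model);
    for (int i = 0; i < n; i++) {
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }

    // direct lookup by key; returns -1 if the key is absent
    if (llama_model_meta_val_str(model, "general.architecture", val, sizeof(val)) >= 0) {
        printf("arch: %s\n", val);
    }
}
```

Array-typed GGUF keys are skipped when `model.gguf_kv` is populated (see the `llm_load_hparams` hunk above), so they will not show up here.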
@@ -8724,8 +8811,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }

+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+    struct llama_kv_cache_view result = {
+        /*.n_cells = */ 0,
+        /*.n_max_seq = */ n_max_seq,
+        /*.token_count = */ 0,
+        /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
+        /*.max_contiguous = */ 0,
+        /*.max_contiguous_idx = */ -1,
+        /*.cells = */ nullptr,
+        /*.cells_sequences = */ nullptr,
+    };
+    return result;
+}
+
+void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+    if (view->cells != nullptr) {
+        free(view->cells);
+        view->cells = nullptr;
+    }
+    if (view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-
+    int result = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+        result += ctx->kv_self.cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+    return ctx->kv_self.used;
 }

 void llama_kv_cache_clear(struct llama_context * ctx) {
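Note: `llama_kv_cache_view_*` is a debugging/introspection API, and `llama_kv_cache_view_update` cross-checks its own cell count against `kv_self.used`. `llama_get_kv_cache_token_count` now sums `seq_id.size()` over all cells, while the new `llama_get_kv_cache_used_cells` reports occupancy. A usage sketch (assumes an existing `llama_context * ctx`):

```cpp
#include "llama.h"
#include <cstdio>

void print_kv_occupancy(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 4);
    llama_kv_cache_view_update(ctx, &view); // (re)allocates and fills the cell arrays

    printf("cells = %d, tokens = %d, used cells = %d, longest free run = %d\n",
           view.n_cells, view.token_count, view.used_cells, view.max_contiguous);

    llama_kv_cache_view_free(&view);        // releases cells/cells_sequences
}
```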
@@ -8895,10 +9081,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const size_t kv_buf_size = kv_self.buf.size;
         const uint32_t kv_head = kv_self.head;
         const uint32_t kv_size = kv_self.size;
+        const uint32_t kv_used = kv_self.used;

         data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
         data_ctx->write(&kv_head, sizeof(kv_head));
         data_ctx->write(&kv_size, sizeof(kv_size));
+        data_ctx->write(&kv_used, sizeof(kv_used));

         if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
@@ -9021,10 +9209,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         size_t kv_buf_size;
         uint32_t kv_head;
         uint32_t kv_size;
+        uint32_t kv_used;

         memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
         memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
         memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);
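Note: the state blob now carries `kv_used` right after `kv_size`, on both the write path (`llama_copy_state_data_internal`) and the read path (`llama_set_state_data`), so state saved by an older version of the library will not line up with this layout. A save/restore sketch using the existing state API (error handling omitted):

```cpp
#include "llama.h"
#include <cstdint>
#include <vector>

std::vector<uint8_t> save_state(struct llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, buf.data()); // now also serializes kv_used
    return buf;
}

void restore_state(struct llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data()); // restores head, size and used
}
```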
@@ -9059,6 +9249,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

     ctx->kv_self.head = kv_head;
     ctx->kv_self.size = kv_size;
+    ctx->kv_self.used = kv_used;

     ctx->kv_self.cells.resize(kv_size);

@@ -9521,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
+#ifdef GGML_USE_METAL
+    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#endif
 }

 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
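Note: with this change (plus the `llama_state` constructor and the `llama_new_context_with_model` hunk above), a single `llama_log_set` call now routes Metal backend logging through the same callback as the rest of the library. A usage sketch (the callback shape follows `ggml_log_callback` from ggml.h):

```cpp
#include "llama.h"
#include <cstdio>

static void my_log(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr); // route all library logging to stderr
}

int main() {
    llama_log_set(my_log, nullptr);
    // subsequent llama.cpp calls (including Metal init when built with
    // GGML_USE_METAL) will log through my_log
    return 0;
}
```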