llama_cpp 0.9.3 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +177 -98
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +154 -30
- data/ext/llama_cpp/src/ggml.h +11 -3
- data/ext/llama_cpp/src/llama.cpp +316 -122
- data/ext/llama_cpp/src/llama.h +72 -4
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -46,7 +46,6 @@
|
|
46
46
|
#endif
|
47
47
|
#include <windows.h>
|
48
48
|
#include <io.h>
|
49
|
-
#include <stdio.h> // for _fseeki64
|
50
49
|
#endif
|
51
50
|
|
52
51
|
#include <algorithm>
|
@@ -91,7 +90,7 @@
|
|
91
90
|
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
92
91
|
#endif
|
93
92
|
|
94
|
-
#define LLAMA_MAX_NODES
|
93
|
+
#define LLAMA_MAX_NODES 8192
|
95
94
|
|
96
95
|
//
|
97
96
|
// logging
|
@@ -604,6 +603,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
|
|
604
603
|
return LLAMA_ROPE_SCALING_UNSPECIFIED;
|
605
604
|
}
|
606
605
|
|
606
|
+
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
607
|
+
switch (type) {
|
608
|
+
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
609
|
+
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
610
|
+
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
611
|
+
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
612
|
+
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
613
|
+
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
614
|
+
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
615
|
+
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
616
|
+
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
617
|
+
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
618
|
+
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
619
|
+
default: return format("unknown type %d", type);
|
620
|
+
}
|
621
|
+
}
|
622
|
+
|
623
|
+
static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
|
624
|
+
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
625
|
+
|
626
|
+
switch (type) {
|
627
|
+
case GGUF_TYPE_STRING:
|
628
|
+
return gguf_get_val_str(ctx_gguf, i);
|
629
|
+
case GGUF_TYPE_ARRAY:
|
630
|
+
{
|
631
|
+
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
632
|
+
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
633
|
+
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
634
|
+
std::stringstream ss;
|
635
|
+
ss << "[";
|
636
|
+
for (int j = 0; j < arr_n; j++) {
|
637
|
+
if (arr_type == GGUF_TYPE_STRING) {
|
638
|
+
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
639
|
+
// escape quotes
|
640
|
+
replace_all(val, "\\", "\\\\");
|
641
|
+
replace_all(val, "\"", "\\\"");
|
642
|
+
ss << '"' << val << '"';
|
643
|
+
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
644
|
+
ss << "???";
|
645
|
+
} else {
|
646
|
+
ss << gguf_data_to_str(arr_type, data, j);
|
647
|
+
}
|
648
|
+
if (j < arr_n - 1) {
|
649
|
+
ss << ", ";
|
650
|
+
}
|
651
|
+
}
|
652
|
+
ss << "]";
|
653
|
+
return ss.str();
|
654
|
+
}
|
655
|
+
default:
|
656
|
+
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
657
|
+
}
|
658
|
+
}
|
659
|
+
|
607
660
|
//
|
608
661
|
// ggml helpers
|
609
662
|
//
|
@@ -1059,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
|
1059
1112
|
//
|
1060
1113
|
|
1061
1114
|
struct llama_state {
|
1115
|
+
llama_state() {
|
1116
|
+
#ifdef GGML_USE_METAL
|
1117
|
+
ggml_metal_log_set_callback(log_callback, log_callback_user_data);
|
1118
|
+
#endif
|
1119
|
+
}
|
1120
|
+
|
1062
1121
|
// We save the log callback globally
|
1063
1122
|
ggml_log_callback log_callback = llama_log_callback_default;
|
1064
1123
|
void * log_callback_user_data = nullptr;
|
@@ -1221,6 +1280,7 @@ struct llama_kv_cache {
|
|
1221
1280
|
// cannot be freely changed after a slot has been allocated.
|
1222
1281
|
uint32_t head = 0;
|
1223
1282
|
uint32_t size = 0;
|
1283
|
+
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
1224
1284
|
|
1225
1285
|
// computed before each graph build
|
1226
1286
|
uint32_t n = 0;
|
@@ -1322,6 +1382,9 @@ struct llama_model {
|
|
1322
1382
|
|
1323
1383
|
int n_gpu_layers;
|
1324
1384
|
|
1385
|
+
// gguf metadata
|
1386
|
+
std::unordered_map<std::string, std::string> gguf_kv;
|
1387
|
+
|
1325
1388
|
// context
|
1326
1389
|
struct ggml_context * ctx = NULL;
|
1327
1390
|
|
@@ -1442,6 +1505,7 @@ static bool llama_kv_cache_init(
|
|
1442
1505
|
|
1443
1506
|
cache.head = 0;
|
1444
1507
|
cache.size = n_ctx;
|
1508
|
+
cache.used = 0;
|
1445
1509
|
|
1446
1510
|
cache.cells.clear();
|
1447
1511
|
cache.cells.resize(n_ctx);
|
@@ -1543,6 +1607,8 @@ static bool llama_kv_cache_find_slot(
|
|
1543
1607
|
}
|
1544
1608
|
}
|
1545
1609
|
|
1610
|
+
cache.used += n_tokens;
|
1611
|
+
|
1546
1612
|
return true;
|
1547
1613
|
}
|
1548
1614
|
|
@@ -1563,6 +1629,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
|
|
1563
1629
|
cache.cells[i].seq_id.clear();
|
1564
1630
|
}
|
1565
1631
|
cache.head = 0;
|
1632
|
+
cache.used = 0;
|
1566
1633
|
}
|
1567
1634
|
|
1568
1635
|
static void llama_kv_cache_seq_rm(
|
@@ -1585,6 +1652,9 @@ static void llama_kv_cache_seq_rm(
|
|
1585
1652
|
continue;
|
1586
1653
|
}
|
1587
1654
|
if (cache.cells[i].seq_id.empty()) {
|
1655
|
+
// keep count of the number of used cells
|
1656
|
+
if (cache.cells[i].pos >= 0) cache.used--;
|
1657
|
+
|
1588
1658
|
cache.cells[i].pos = -1;
|
1589
1659
|
if (new_head == cache.size) new_head = i;
|
1590
1660
|
}
|
@@ -1592,7 +1662,7 @@ static void llama_kv_cache_seq_rm(
|
|
1592
1662
|
}
|
1593
1663
|
|
1594
1664
|
// If we freed up a slot, set head to it so searching can start there.
|
1595
|
-
if (new_head != cache.size) cache.head = new_head;
|
1665
|
+
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
|
1596
1666
|
}
|
1597
1667
|
|
1598
1668
|
static void llama_kv_cache_seq_cp(
|
@@ -1618,6 +1688,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
|
|
1618
1688
|
|
1619
1689
|
for (uint32_t i = 0; i < cache.size; ++i) {
|
1620
1690
|
if (!cache.cells[i].has_seq_id(seq_id)) {
|
1691
|
+
if (cache.cells[i].pos >= 0) cache.used--;
|
1621
1692
|
cache.cells[i].pos = -1;
|
1622
1693
|
cache.cells[i].seq_id.clear();
|
1623
1694
|
if (new_head == cache.size) new_head = i;
|
@@ -1628,7 +1699,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
|
|
1628
1699
|
}
|
1629
1700
|
|
1630
1701
|
// If we freed up a slot, set head to it so searching can start there.
|
1631
|
-
if (new_head != cache.size) cache.head = new_head;
|
1702
|
+
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
|
1632
1703
|
}
|
1633
1704
|
|
1634
1705
|
static void llama_kv_cache_seq_shift(
|
@@ -1649,6 +1720,7 @@ static void llama_kv_cache_seq_shift(
|
|
1649
1720
|
cache.cells[i].delta += delta;
|
1650
1721
|
|
1651
1722
|
if (cache.cells[i].pos < 0) {
|
1723
|
+
if (!cache.cells[i].seq_id.empty()) cache.used--;
|
1652
1724
|
cache.cells[i].pos = -1;
|
1653
1725
|
cache.cells[i].seq_id.clear();
|
1654
1726
|
if (new_head == cache.size) new_head = i;
|
@@ -1780,10 +1852,10 @@ struct llama_model_loader {
|
|
1780
1852
|
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
|
1781
1853
|
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
1782
1854
|
default:
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1855
|
+
{
|
1856
|
+
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
1857
|
+
ftype = LLAMA_FTYPE_ALL_F32;
|
1858
|
+
} break;
|
1787
1859
|
}
|
1788
1860
|
|
1789
1861
|
// this is a way to mark that we have "guessed" the file type
|
@@ -1797,10 +1869,21 @@ struct llama_model_loader {
|
|
1797
1869
|
}
|
1798
1870
|
|
1799
1871
|
for (int i = 0; i < n_kv; i++) {
|
1800
|
-
const char * name
|
1801
|
-
const enum gguf_type type
|
1872
|
+
const char * name = gguf_get_key(ctx_gguf, i);
|
1873
|
+
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
1874
|
+
const std::string type_name =
|
1875
|
+
type == GGUF_TYPE_ARRAY
|
1876
|
+
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
|
1877
|
+
: gguf_type_name(type);
|
1878
|
+
|
1879
|
+
std::string value = gguf_kv_to_str(ctx_gguf, i);
|
1880
|
+
const size_t MAX_VALUE_LEN = 40;
|
1881
|
+
if (value.size() > MAX_VALUE_LEN) {
|
1882
|
+
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
1883
|
+
}
|
1884
|
+
replace_all(value, "\n", "\\n");
|
1802
1885
|
|
1803
|
-
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-
|
1886
|
+
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
1804
1887
|
}
|
1805
1888
|
|
1806
1889
|
// print type counts
|
@@ -2095,6 +2178,17 @@ static void llm_load_hparams(
|
|
2095
2178
|
|
2096
2179
|
auto & hparams = model.hparams;
|
2097
2180
|
|
2181
|
+
// get metadata as string
|
2182
|
+
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
2183
|
+
enum gguf_type type = gguf_get_kv_type(ctx, i);
|
2184
|
+
if (type == GGUF_TYPE_ARRAY) {
|
2185
|
+
continue;
|
2186
|
+
}
|
2187
|
+
const char * name = gguf_get_key(ctx, i);
|
2188
|
+
const std::string value = gguf_kv_to_str(ctx, i);
|
2189
|
+
model.gguf_kv.emplace(name, value);
|
2190
|
+
}
|
2191
|
+
|
2098
2192
|
// get general kv
|
2099
2193
|
GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
|
2100
2194
|
|
@@ -2545,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
2545
2639
|
}
|
2546
2640
|
|
2547
2641
|
// general kv
|
2548
|
-
LLAMA_LOG_INFO("%s: general.name
|
2642
|
+
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
2549
2643
|
|
2550
2644
|
// special tokens
|
2551
|
-
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token
|
2552
|
-
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token
|
2553
|
-
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token
|
2554
|
-
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token
|
2555
|
-
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token
|
2556
|
-
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token
|
2645
|
+
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
2646
|
+
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
2647
|
+
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
2648
|
+
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
2649
|
+
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
2650
|
+
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
2557
2651
|
}
|
2558
2652
|
|
2559
2653
|
static void llm_load_tensors(
|
@@ -3375,7 +3469,7 @@ static void llm_build_k_shift(
|
|
3375
3469
|
struct ggml_cgraph * graph,
|
3376
3470
|
llm_rope_type type,
|
3377
3471
|
int64_t n_ctx,
|
3378
|
-
|
3472
|
+
int n_rot,
|
3379
3473
|
float freq_base,
|
3380
3474
|
float freq_scale,
|
3381
3475
|
const llm_build_cb & cb) {
|
@@ -3407,7 +3501,7 @@ static void llm_build_k_shift(
|
|
3407
3501
|
// we rotate only the first n_rot dimensions
|
3408
3502
|
ggml_rope_custom_inplace(ctx,
|
3409
3503
|
ggml_view_3d(ctx, kv.k,
|
3410
|
-
|
3504
|
+
n_embd_head, n_head_kv, n_ctx,
|
3411
3505
|
ggml_element_size(kv.k)*n_embd_head,
|
3412
3506
|
ggml_element_size(kv.k)*n_embd_gqa,
|
3413
3507
|
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
|
@@ -3605,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
|
|
3605
3699
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
3606
3700
|
cb(kq, "kq", il);
|
3607
3701
|
|
3608
|
-
kq = ggml_scale(ctx, kq, kq_scale);
|
3609
|
-
cb(kq, "kq_scaled", il);
|
3610
|
-
|
3611
3702
|
if (max_alibi_bias > 0.0f) {
|
3612
|
-
//
|
3613
|
-
|
3614
|
-
|
3615
|
-
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
|
3616
|
-
cb(kq, "kq_scaled_alibi", il);
|
3617
|
-
}
|
3703
|
+
// temporary branch until we figure out how to handle ggml_alibi through ggml_add
|
3704
|
+
kq = ggml_scale(ctx, kq, kq_scale);
|
3705
|
+
cb(kq, "kq_scaled", il);
|
3618
3706
|
|
3619
|
-
|
3620
|
-
|
3707
|
+
if (max_alibi_bias > 0.0f) {
|
3708
|
+
// TODO: n_head or n_head_kv
|
3709
|
+
// TODO: K-shift is likely not working
|
3710
|
+
// TODO: change to ggml_add
|
3711
|
+
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
|
3712
|
+
cb(kq, "kq_scaled_alibi", il);
|
3713
|
+
}
|
3621
3714
|
|
3622
|
-
|
3623
|
-
|
3715
|
+
kq = ggml_add(ctx, kq, kq_mask);
|
3716
|
+
cb(kq, "kq_masked", il);
|
3717
|
+
|
3718
|
+
kq = ggml_soft_max(ctx, kq);
|
3719
|
+
cb(kq, "kq_soft_max", il);
|
3720
|
+
} else {
|
3721
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
|
3722
|
+
cb(kq, "kq_soft_max_ext", il);
|
3723
|
+
}
|
3624
3724
|
|
3625
3725
|
// split cached v into n_head heads
|
3626
3726
|
struct ggml_tensor * v =
|
@@ -4730,92 +4830,34 @@ struct llm_build_context {
|
|
4730
4830
|
// self-attention
|
4731
4831
|
{
|
4732
4832
|
// compute Q and K and RoPE them
|
4733
|
-
struct ggml_tensor *
|
4734
|
-
cb(
|
4833
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
4834
|
+
cb(Qcur, "Qcur", il);
|
4735
4835
|
|
4736
|
-
struct ggml_tensor *
|
4737
|
-
cb(
|
4836
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
4837
|
+
cb(Kcur, "Kcur", il);
|
4738
4838
|
|
4739
4839
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
4740
4840
|
cb(Vcur, "Vcur", il);
|
4741
4841
|
|
4742
|
-
|
4743
|
-
|
4744
|
-
|
4745
|
-
|
4746
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
4747
|
-
0
|
4748
|
-
));
|
4749
|
-
cb(qrot, "qrot", il);
|
4750
|
-
|
4751
|
-
struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
|
4752
|
-
ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
|
4753
|
-
ggml_element_size(tmpk) * n_embd_head,
|
4754
|
-
ggml_element_size(tmpk) * n_embd_head * n_head_kv,
|
4755
|
-
0
|
4756
|
-
));
|
4757
|
-
cb(krot, "krot", il);
|
4758
|
-
|
4759
|
-
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
4760
|
-
struct ggml_tensor * qpass = ggml_view_3d(
|
4761
|
-
ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
|
4762
|
-
ggml_element_size(tmpq) * n_embd_head,
|
4763
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
4764
|
-
ggml_element_size(tmpq) * hparams.n_rot
|
4765
|
-
);
|
4766
|
-
cb(qpass, "qpass", il);
|
4767
|
-
|
4768
|
-
struct ggml_tensor * kpass = ggml_view_3d(
|
4769
|
-
ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
|
4770
|
-
ggml_element_size(tmpk) * (n_embd_head),
|
4771
|
-
ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
|
4772
|
-
ggml_element_size(tmpk) * hparams.n_rot
|
4773
|
-
);
|
4774
|
-
cb(kpass, "kpass", il);
|
4775
|
-
|
4776
|
-
struct ggml_tensor * qrotated = ggml_rope_custom(
|
4777
|
-
ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
4778
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
4779
|
-
);
|
4780
|
-
cb(qrotated, "qrotated", il);
|
4781
|
-
|
4782
|
-
struct ggml_tensor * krotated = ggml_rope_custom(
|
4783
|
-
ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
4784
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
4842
|
+
Qcur = ggml_rope_custom(
|
4843
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
4844
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
4845
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
4785
4846
|
);
|
4786
|
-
cb(krotated, "krotated", il);
|
4787
|
-
|
4788
|
-
// ggml currently only supports concatenation on dim=2
|
4789
|
-
// so we need to permute qrot, qpass, concat, then permute back.
|
4790
|
-
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
4791
|
-
cb(qrotated, "qrotated", il);
|
4792
|
-
|
4793
|
-
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
4794
|
-
cb(krotated, "krotated", il);
|
4795
|
-
|
4796
|
-
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
4797
|
-
cb(qpass, "qpass", il);
|
4798
|
-
|
4799
|
-
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
4800
|
-
cb(kpass, "kpass", il);
|
4801
|
-
|
4802
|
-
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
4803
4847
|
cb(Qcur, "Qcur", il);
|
4804
4848
|
|
4805
|
-
|
4806
|
-
|
4807
|
-
|
4808
|
-
|
4809
|
-
|
4810
|
-
|
4811
|
-
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
4849
|
+
Kcur = ggml_rope_custom(
|
4850
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
4851
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
4852
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
4853
|
+
);
|
4812
4854
|
cb(Kcur, "Kcur", il);
|
4813
4855
|
|
4814
4856
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4815
4857
|
|
4816
4858
|
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4817
4859
|
model.layers[il].wo, NULL,
|
4818
|
-
|
4860
|
+
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
4819
4861
|
cb(cur, "kqv_out", il);
|
4820
4862
|
}
|
4821
4863
|
|
@@ -5000,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|
5000
5042
|
{ "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
|
5001
5043
|
{ "kq_masked", OFFLOAD_FUNC_KQ },
|
5002
5044
|
{ "kq_soft_max", OFFLOAD_FUNC_V },
|
5045
|
+
{ "kq_soft_max_ext", OFFLOAD_FUNC_V },
|
5003
5046
|
{ "v", OFFLOAD_FUNC_V },
|
5004
5047
|
{ "kqv", OFFLOAD_FUNC_V },
|
5005
5048
|
{ "kqv_merged", OFFLOAD_FUNC_V },
|
@@ -5443,6 +5486,12 @@ static int llama_decode_internal(
|
|
5443
5486
|
batch.seq_id = seq_id_arr.data();
|
5444
5487
|
}
|
5445
5488
|
|
5489
|
+
// if we have enough unused cells before the current head ->
|
5490
|
+
// better to start searching from the beginning of the cache, hoping to fill it
|
5491
|
+
if (kv_self.head > kv_self.used + 2*n_tokens) {
|
5492
|
+
kv_self.head = 0;
|
5493
|
+
}
|
5494
|
+
|
5446
5495
|
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
5447
5496
|
return 1;
|
5448
5497
|
}
|
@@ -5453,7 +5502,7 @@ static int llama_decode_internal(
|
|
5453
5502
|
//kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
|
5454
5503
|
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
|
5455
5504
|
|
5456
|
-
//printf("kv_self.n = %
|
5505
|
+
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
5457
5506
|
|
5458
5507
|
ggml_allocr_reset(lctx.alloc);
|
5459
5508
|
|
@@ -5502,18 +5551,8 @@ static int llama_decode_internal(
|
|
5502
5551
|
n_threads = std::min(4, n_threads);
|
5503
5552
|
}
|
5504
5553
|
|
5505
|
-
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
5506
|
-
const bool full_offload_supported =
|
5507
|
-
model.arch == LLM_ARCH_LLAMA ||
|
5508
|
-
model.arch == LLM_ARCH_BAICHUAN ||
|
5509
|
-
model.arch == LLM_ARCH_FALCON ||
|
5510
|
-
model.arch == LLM_ARCH_REFACT ||
|
5511
|
-
model.arch == LLM_ARCH_MPT ||
|
5512
|
-
model.arch == LLM_ARCH_STARCODER ||
|
5513
|
-
model.arch == LLM_ARCH_STABLELM;
|
5514
|
-
|
5515
5554
|
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
5516
|
-
if (ggml_cpu_has_cublas() &&
|
5555
|
+
if (ggml_cpu_has_cublas() && fully_offloaded) {
|
5517
5556
|
n_threads = 1;
|
5518
5557
|
}
|
5519
5558
|
|
@@ -6372,10 +6411,13 @@ struct llama_grammar_candidate {
|
|
6372
6411
|
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
6373
6412
|
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
6374
6413
|
const char * src,
|
6414
|
+
size_t n_src,
|
6375
6415
|
llama_partial_utf8 partial_start) {
|
6376
6416
|
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
6377
6417
|
const char * pos = src;
|
6378
6418
|
std::vector<uint32_t> code_points;
|
6419
|
+
// common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
|
6420
|
+
code_points.reserve(n_src + 1);
|
6379
6421
|
uint32_t value = partial_start.value;
|
6380
6422
|
int n_remain = partial_start.n_remain;
|
6381
6423
|
|
@@ -6426,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
|
6426
6468
|
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
|
6427
6469
|
}
|
6428
6470
|
|
6471
|
+
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
6472
|
+
std::string src,
|
6473
|
+
llama_partial_utf8 partial_start
|
6474
|
+
) {
|
6475
|
+
return decode_utf8(src.c_str(), src.size(), partial_start);
|
6476
|
+
}
|
6477
|
+
|
6429
6478
|
// returns true iff pos points to the end of one of the definitions of a rule
|
6430
6479
|
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
6431
6480
|
switch (pos->type) {
|
@@ -6979,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
|
6979
7028
|
// Replace the data in candidates with the new_candidates data
|
6980
7029
|
std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
|
6981
7030
|
candidates->size = new_candidates.size();
|
7031
|
+
candidates->sorted = false;
|
6982
7032
|
|
6983
7033
|
if (ctx) {
|
6984
7034
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
@@ -7075,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
7075
7125
|
} else if (piece.empty() || piece[0] == 0) {
|
7076
7126
|
candidates->data[i].logit = -INFINITY;
|
7077
7127
|
} else {
|
7078
|
-
candidates_decoded.push_back(decode_utf8(piece
|
7128
|
+
candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
|
7079
7129
|
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
7080
7130
|
}
|
7081
7131
|
}
|
@@ -7282,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
7282
7332
|
const std::string piece = llama_token_to_piece(ctx, token);
|
7283
7333
|
|
7284
7334
|
// Note terminating 0 in decoded string
|
7285
|
-
const auto decoded = decode_utf8(piece
|
7335
|
+
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
7286
7336
|
const auto & code_points = decoded.first;
|
7287
7337
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
7288
7338
|
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
@@ -8527,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(
|
|
8527
8577
|
|
8528
8578
|
#ifdef GGML_USE_METAL
|
8529
8579
|
if (model->n_gpu_layers > 0) {
|
8530
|
-
ggml_metal_log_set_callback(llama_log_callback_default, NULL);
|
8531
|
-
|
8532
8580
|
ctx->ctx_metal = ggml_metal_init(1);
|
8533
8581
|
if (!ctx->ctx_metal) {
|
8534
8582
|
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
@@ -8666,6 +8714,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
|
|
8666
8714
|
return model->hparams.rope_freq_scale_train;
|
8667
8715
|
}
|
8668
8716
|
|
8717
|
+
int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
|
8718
|
+
const auto & it = model->gguf_kv.find(key);
|
8719
|
+
if (it == model->gguf_kv.end()) {
|
8720
|
+
if (buf_size > 0) {
|
8721
|
+
buf[0] = '\0';
|
8722
|
+
}
|
8723
|
+
return -1;
|
8724
|
+
}
|
8725
|
+
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
8726
|
+
}
|
8727
|
+
|
8728
|
+
int llama_model_meta_count(const struct llama_model * model) {
|
8729
|
+
return (int)model->gguf_kv.size();
|
8730
|
+
}
|
8731
|
+
|
8732
|
+
int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
|
8733
|
+
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
8734
|
+
if (buf_size > 0) {
|
8735
|
+
buf[0] = '\0';
|
8736
|
+
}
|
8737
|
+
return -1;
|
8738
|
+
}
|
8739
|
+
auto it = model->gguf_kv.begin();
|
8740
|
+
std::advance(it, i);
|
8741
|
+
return snprintf(buf, buf_size, "%s", it->first.c_str());
|
8742
|
+
}
|
8743
|
+
|
8744
|
+
int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
|
8745
|
+
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
8746
|
+
if (buf_size > 0) {
|
8747
|
+
buf[0] = '\0';
|
8748
|
+
}
|
8749
|
+
return -1;
|
8750
|
+
}
|
8751
|
+
auto it = model->gguf_kv.begin();
|
8752
|
+
std::advance(it, i);
|
8753
|
+
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
8754
|
+
}
|
8755
|
+
|
8669
8756
|
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
8670
8757
|
return snprintf(buf, buf_size, "%s %s %s",
|
8671
8758
|
llama_model_arch_name(model->arch).c_str(),
|
@@ -8724,8 +8811,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
|
|
8724
8811
|
}
|
8725
8812
|
}
|
8726
8813
|
|
8814
|
+
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
|
8815
|
+
struct llama_kv_cache_view result = {
|
8816
|
+
/*.n_cells = */ 0,
|
8817
|
+
/*.n_max_seq = */ n_max_seq,
|
8818
|
+
/*.token_count = */ 0,
|
8819
|
+
/*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
|
8820
|
+
/*.max_contiguous = */ 0,
|
8821
|
+
/*.max_contiguous_idx = */ -1,
|
8822
|
+
/*.cells = */ nullptr,
|
8823
|
+
/*.cells_sequences = */ nullptr,
|
8824
|
+
};
|
8825
|
+
return result;
|
8826
|
+
}
|
8827
|
+
|
8828
|
+
void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
|
8829
|
+
if (view->cells != nullptr) {
|
8830
|
+
free(view->cells);
|
8831
|
+
view->cells = nullptr;
|
8832
|
+
}
|
8833
|
+
if (view->cells_sequences != nullptr) {
|
8834
|
+
free(view->cells_sequences);
|
8835
|
+
view->cells_sequences = nullptr;
|
8836
|
+
}
|
8837
|
+
}
|
8838
|
+
|
8839
|
+
void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
|
8840
|
+
if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
|
8841
|
+
view->n_cells = int32_t(ctx->kv_self.size);
|
8842
|
+
void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
|
8843
|
+
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
|
8844
|
+
view->cells = (struct llama_kv_cache_view_cell *)p;
|
8845
|
+
p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
|
8846
|
+
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
|
8847
|
+
view->cells_sequences = (llama_seq_id *)p;
|
8848
|
+
}
|
8849
|
+
|
8850
|
+
const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
|
8851
|
+
llama_kv_cache_view_cell * c_curr = view->cells;
|
8852
|
+
llama_seq_id * cs_curr = view->cells_sequences;
|
8853
|
+
int32_t used_cells = 0;
|
8854
|
+
int32_t token_count = 0;
|
8855
|
+
int32_t curr_contig_idx = -1;
|
8856
|
+
uint32_t max_contig = 0;
|
8857
|
+
int32_t max_contig_idx = -1;
|
8858
|
+
|
8859
|
+
for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
|
8860
|
+
const size_t curr_size = kv_cells[i].seq_id.size();
|
8861
|
+
token_count += curr_size;
|
8862
|
+
c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
|
8863
|
+
|
8864
|
+
if (curr_size > 0) {
|
8865
|
+
if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
|
8866
|
+
max_contig = i - curr_contig_idx;
|
8867
|
+
max_contig_idx = curr_contig_idx;
|
8868
|
+
}
|
8869
|
+
curr_contig_idx = -1;
|
8870
|
+
} else if (curr_contig_idx < 0) {
|
8871
|
+
curr_contig_idx = i;
|
8872
|
+
}
|
8873
|
+
|
8874
|
+
int seq_idx = 0;
|
8875
|
+
for (const llama_seq_id it : kv_cells[i].seq_id) {
|
8876
|
+
if (seq_idx >= view->n_max_seq) {
|
8877
|
+
break;
|
8878
|
+
}
|
8879
|
+
cs_curr[seq_idx] = it;
|
8880
|
+
seq_idx++;
|
8881
|
+
}
|
8882
|
+
if (seq_idx != 0) {
|
8883
|
+
used_cells++;
|
8884
|
+
}
|
8885
|
+
for (; seq_idx < view->n_max_seq; seq_idx++) {
|
8886
|
+
cs_curr[seq_idx] = -1;
|
8887
|
+
}
|
8888
|
+
}
|
8889
|
+
if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
|
8890
|
+
max_contig_idx = curr_contig_idx;
|
8891
|
+
max_contig = kv_cells.size() - curr_contig_idx;
|
8892
|
+
}
|
8893
|
+
view->max_contiguous = max_contig;
|
8894
|
+
view->max_contiguous_idx = max_contig_idx;
|
8895
|
+
view->token_count = token_count;
|
8896
|
+
view->used_cells = used_cells;
|
8897
|
+
if (uint32_t(used_cells) != ctx->kv_self.used) {
|
8898
|
+
LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
|
8899
|
+
__func__, ctx->kv_self.used, used_cells);
|
8900
|
+
}
|
8901
|
+
}
|
8902
|
+
|
8727
8903
|
// Returns the total number of sequence ids held by the first kv_self.size cells
// of the context's KV cache. A cell that belongs to several sequences is counted
// once per sequence.
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    const auto & kv = ctx->kv_self;

    int n_tokens = 0;
    for (uint32_t idx = 0; idx < kv.size; ++idx) {
        n_tokens += kv.cells[idx].seq_id.size();
    }

    return n_tokens;
}
|
8912
|
+
|
8913
|
+
int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
|
8914
|
+
return ctx->kv_self.used;
|
8729
8915
|
}
|
8730
8916
|
|
8731
8917
|
void llama_kv_cache_clear(struct llama_context * ctx) {
|
@@ -8895,10 +9081,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
8895
9081
|
const size_t kv_buf_size = kv_self.buf.size;
|
8896
9082
|
const uint32_t kv_head = kv_self.head;
|
8897
9083
|
const uint32_t kv_size = kv_self.size;
|
9084
|
+
const uint32_t kv_used = kv_self.used;
|
8898
9085
|
|
8899
9086
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
8900
9087
|
data_ctx->write(&kv_head, sizeof(kv_head));
|
8901
9088
|
data_ctx->write(&kv_size, sizeof(kv_size));
|
9089
|
+
data_ctx->write(&kv_used, sizeof(kv_used));
|
8902
9090
|
|
8903
9091
|
if (kv_buf_size) {
|
8904
9092
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
@@ -9021,10 +9209,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9021
9209
|
size_t kv_buf_size;
|
9022
9210
|
uint32_t kv_head;
|
9023
9211
|
uint32_t kv_size;
|
9212
|
+
uint32_t kv_used;
|
9024
9213
|
|
9025
9214
|
memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
|
9026
9215
|
memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
|
9027
9216
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
9217
|
+
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
9028
9218
|
|
9029
9219
|
if (kv_buf_size) {
|
9030
9220
|
GGML_ASSERT(kv_self.buf.size == kv_buf_size);
|
@@ -9059,6 +9249,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
9059
9249
|
|
9060
9250
|
ctx->kv_self.head = kv_head;
|
9061
9251
|
ctx->kv_self.size = kv_size;
|
9252
|
+
ctx->kv_self.used = kv_used;
|
9062
9253
|
|
9063
9254
|
ctx->kv_self.cells.resize(kv_size);
|
9064
9255
|
|
@@ -9521,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
|
|
9521
9712
|
// Installs the global log callback and its user data.
//
// Passing a null callback restores llama_log_callback_default. When built with
// GGML_USE_METAL the same callback and user data are also given to the Metal
// backend's logger.
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    if (log_callback) {
        g_state.log_callback = log_callback;
    } else {
        g_state.log_callback = llama_log_callback_default;
    }
    g_state.log_callback_user_data = user_data;

#ifdef GGML_USE_METAL
    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#endif
}
|
9525
9719
|
|
9526
9720
|
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
|