llama_cpp 0.9.2 → 0.9.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,6 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
+#define LLAMA_MAX_NODES 8192
+
 //
 // logging
 //
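`LLAMA_MAX_NODES` gives llama.cpp its own node budget for the explicitly sized ggml graphs used throughout this diff. A minimal sketch of the sizing arithmetic, assuming only tensor and graph metadata live in the context (`MAX_NODES` here is a stand-in for the real constant, and the helper is illustrative, not part of the library):

```cpp
#include "ggml.h"

#define MAX_NODES 8192

// Sketch: size a ggml context so that a graph holding up to MAX_NODES
// tensors fits without further allocation. This mirrors how llama.cpp
// sizes buf_compute later in this diff; no_alloc keeps tensor data out.
struct ggml_context * make_graph_ctx(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*MAX_NODES + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    return ggml_init(params);
}

// The graph itself is then allocated with an explicit size instead of
// GGML_DEFAULT_GRAPH_SIZE, as the builders below do:
//   struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, MAX_NODES, /*grads=*/false);
```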
@@ -190,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
 };
 
 enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
 };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_STABLELM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -577,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
     return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }
 
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
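These helpers feed the richer metadata logging below and the `model.gguf_kv` map added further down. For reference, a hedged sketch of enumerating the same key/value metadata through the public gguf API (the file name is illustrative):

```cpp
#include <cstdio>
#include "ggml.h"  // the gguf_* API ships with ggml in this tree

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // illustrative path
    if (!ctx) return 1;

    const int n_kv = gguf_get_n_kv(ctx);
    for (int i = 0; i < n_kv; i++) {
        // gguf_kv_to_str() above renders the value; here we print key and type only
        printf("%-42s %s\n", gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
    }

    gguf_free(ctx);
    return 0;
}
```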
@@ -1055,9 +1136,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
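These are binary (IEC) units: 1 kiB = 1024 bytes, 1 MiB = 1024 kiB, 1 GiB = 1024 MiB. The log strings throughout the rest of this diff change from "MB" to "MiB" accordingly, which matches the `/ 1024.0 / 1024.0` arithmetic they actually report.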
@@ -1194,6 +1275,7 @@ struct llama_kv_cache {
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
     uint32_t n = 0;
@@ -1248,6 +1330,9 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
     id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
@@ -1292,6 +1377,9 @@ struct llama_model {
 
     int n_gpu_layers;
 
+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     // context
     struct ggml_context * ctx = NULL;
 
@@ -1412,6 +1500,7 @@ static bool llama_kv_cache_init(
 
     cache.head = 0;
     cache.size = n_ctx;
+    cache.used = 0;
 
     cache.cells.clear();
     cache.cells.resize(n_ctx);
@@ -1453,7 +1542,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -1513,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
         }
     }
 
+    cache.used += n_tokens;
+
     return true;
 }
 
@@ -1533,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
         cache.cells[i].seq_id.clear();
     }
     cache.head = 0;
+    cache.used = 0;
 }
 
 static void llama_kv_cache_seq_rm(
@@ -1555,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
                 continue;
             }
             if (cache.cells[i].seq_id.empty()) {
+                // keep count of the number of used cells
+                if (cache.cells[i].pos >= 0) cache.used--;
+
                 cache.cells[i].pos = -1;
                 if (new_head == cache.size) new_head = i;
             }
@@ -1562,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
     }
 
     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_cp(
@@ -1588,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
+            if (cache.cells[i].pos >= 0) cache.used--;
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
             if (new_head == cache.size) new_head = i;
@@ -1598,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     }
 
     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_shift(
@@ -1619,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
             cache.cells[i].delta += delta;
 
             if (cache.cells[i].pos < 0) {
+                if (!cache.cells[i].seq_id.empty()) cache.used--;
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
                 if (new_head == cache.size) new_head = i;
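Taken together, these hunks maintain the new `used` counter incrementally across slot allocation, clearing, removal, keep, and shift. A hedged C++ sketch of the invariant being preserved (the cell type is a simplified stand-in for llama.cpp's internal structs, not the real ones):

```cpp
#include <cstdint>
#include <set>
#include <vector>

// Simplified stand-in for llama.cpp's internal kv cell (illustrative only).
struct kv_cell {
    int32_t pos = -1;          // -1 means the cell is empty
    std::set<int32_t> seq_id;  // sequences referencing this cell
};

// The invariant the incremental bookkeeping above maintains:
// `used` always equals the number of cells holding a valid position
// and belonging to at least one sequence.
uint32_t recount_used(const std::vector<kv_cell> & cells) {
    uint32_t used = 0;
    for (const auto & c : cells) {
        if (c.pos >= 0 && !c.seq_id.empty()) {
            used++;
        }
    }
    return used;
}
```

The same recount appears later in this diff as a consistency check inside `llama_kv_cache_view_update`.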
@@ -1750,10 +1847,10 @@ struct llama_model_loader {
                 case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
                 case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
                 default:
-                     {
-                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
-                         ftype = LLAMA_FTYPE_ALL_F32;
-                     } break;
+                    {
+                        LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                        ftype = LLAMA_FTYPE_ALL_F32;
+                    } break;
             }
 
             // this is a way to mark that we have "guessed" the file type
@@ -1767,10 +1864,21 @@ struct llama_model_loader {
         }
 
         for (int i = 0; i < n_kv; i++) {
-            const char * name         = gguf_get_key(ctx_gguf, i);
-            const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+            const char * name           = gguf_get_key(ctx_gguf, i);
+            const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(ctx_gguf, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
 
-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
 
         // print type counts
@@ -2065,6 +2173,17 @@ static void llm_load_hparams(
 
     auto & hparams = model.hparams;
 
+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
@@ -2209,6 +2328,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+
         default: (void)0;
     }
 
@@ -2350,6 +2479,23 @@ static void llm_load_vocab(
                     __func__, key.c_str(), id, old_id);
                 id = old_id;
             }
+
+        }
+
+        // Handle add_bos_token and add_eos_token
+        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+        int kid = gguf_find_key(ctx, key.c_str());
+        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        }
+        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+        kid = gguf_find_key(ctx, key.c_str());
+        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
         }
     }
 
@@ -2481,8 +2627,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2520,7 +2666,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -2872,6 +3018,13 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
+#ifdef GGML_USE_CUBLAS
+                    if (n_gpu_layers > int(n_layer + 1)) {
+                        LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+                            __func__, n_layer + 1);
+                        throw std::runtime_error("Persimmon CUDA offload failed");
+                    }
+#endif
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
@@ -3073,6 +3226,81 @@ static void llm_load_tensors(
                         }
                     }
                 } break;
+            case LLM_ARCH_STABLELM:
+                {
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                    // output
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                            backend_norm = llama_backend_offload;
+#else
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+#endif // _WIN32
+
+                            backend_output = llama_backend_offload_split;
+                        } else {
+                            backend_norm = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                        model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                        model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        /*
+                        llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+                        */
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                        layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                        layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                        layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                        }
+                    }
+                } break;
+
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -3087,7 +3315,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3106,7 +3334,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3606,7 +3834,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -3718,7 +3946,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3838,7 +4066,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3960,7 +4188,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4059,7 +4287,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_persimmon() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_rot = n_embd_head / 2;
 
@@ -4204,7 +4432,7 @@ struct llm_build_context {
                 struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur,
+                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
                 cb(Q, "Q", il);
 
                 Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4269,7 +4497,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4360,7 +4588,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4454,7 +4682,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4551,6 +4779,119 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_stablelm() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                model.layers[il].attn_norm,
+                model.layers[il].attn_norm_b,
+                LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                    model.layers[il].wo, NULL,
+                    Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm,
+                    model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up, NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+            model.output_norm,
+            model.output_norm_b,
+            LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 //
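Note that `build_stablelm` follows the LLaMA graph closely but uses additive LayerNorm with bias tensors (`LLM_NORM` rather than the RMS variant) and NEOX-style RoPE applied to only `hparams.n_rot` dimensions of each head. Also visible in the hunk: unlike the other builders in this diff, it still allocates its graph with plain `ggml_new_graph` rather than the `LLAMA_MAX_NODES` budget.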
@@ -5020,6 +5361,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_mpt();
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                result = llm.build_stablelm();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -5129,6 +5474,12 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }
 
+    // if we have enough unused cells before the current head ->
+    //   better to start searching from the beginning of the cache, hoping to fill it
+    if (kv_self.head > kv_self.used + 2*n_tokens) {
+        kv_self.head = 0;
+    }
+
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -5139,7 +5490,7 @@ static int llama_decode_internal(
     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
 
-    //printf("kv_self.n = %5d\n", kv_self.n);
+    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_allocr_reset(lctx.alloc);
 
@@ -5195,7 +5546,8 @@ static int llama_decode_internal(
         model.arch == LLM_ARCH_FALCON ||
         model.arch == LLM_ARCH_REFACT ||
         model.arch == LLM_ARCH_MPT ||
-        model.arch == LLM_ARCH_STARCODER;
+        model.arch == LLM_ARCH_STARCODER ||
+        model.arch == LLM_ARCH_STABLELM;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5987,7 +6339,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                 // and passing 'add space prefix' as bool argument
                 //
-                auto raw_text = " " + fragment.raw_text.substr(fragment.offset, fragment.length);
+                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                if (&fragment == &fragment_buffer.front()) {
+                    raw_text = " " + raw_text; // prefix with space if the first token is not special
+                }
 
 #ifdef PRETOKENIZERDEBUG
                 fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
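The tokenizer change means the SentencePiece space prefix is now applied only to the first text fragment of the input rather than to every fragment, so text that follows a special token no longer picks up a spurious leading space.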
@@ -7639,7 +7994,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -8179,7 +8534,7 @@ struct llama_context * llama_new_context_with_model(
 
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
         // resized during inference
@@ -8196,7 +8551,7 @@ struct llama_context * llama_new_context_with_model(
         {
             static const size_t tensor_alignment = 32;
             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
-            ctx->buf_compute.resize(ggml_tensor_overhead()*
+            ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
             // create measure allocator
             ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8224,7 +8579,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8238,7 +8593,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8258,10 +8613,10 @@ struct llama_context * llama_new_context_with_model(
             size_t ctx_vram_size = alloc_size + kv_vram_size;
             size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                 total_vram_size / 1024.0 / 1024.0,
                 model_vram_size / 1024.0 / 1024.0,
-                ctx_vram_size
+                ctx_vram_size / 1024.0 / 1024.0);
 #endif
         }
 
@@ -8282,7 +8637,7 @@ struct llama_context * llama_new_context_with_model(
 
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
@@ -8348,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
 
+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
         llama_model_arch_name(model->arch).c_str(),
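A hedged usage sketch of the new metadata API. The model path is illustrative; `llama_load_model_from_file` and `llama_model_default_params` are the loaders this llama.cpp version exposes, and `vocab_only` keeps the example cheap since the metadata map is filled during header loading:

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    llama_backend_init(false); // numa = false

    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true; // no weights needed to inspect metadata

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // illustrative path
    if (!model) return 1;

    std::vector<char> key(256), val(256);
    const int n = llama_model_meta_count(model);
    for (int i = 0; i < n; i++) {
        llama_model_meta_key_by_index(model, i, key.data(), key.size());
        llama_model_meta_val_str_by_index(model, i, val.data(), val.size());
        printf("%s = %s\n", key.data(), val.data());
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```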
@@ -8406,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }
 
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+    struct llama_kv_cache_view result = {
+        /*.n_cells            = */ 0,
+        /*.n_max_seq          = */ n_max_seq,
+        /*.token_count        = */ 0,
+        /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
+        /*.max_contiguous     = */ 0,
+        /*.max_contiguous_idx = */ -1,
+        /*.cells              = */ nullptr,
+        /*.cells_sequences    = */ nullptr,
+    };
+    return result;
+}
+
+void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+    if (view->cells != nullptr) {
+        free(view->cells);
+        view->cells = nullptr;
+    }
+    if (view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->kv_self.head;
+    int result = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+        result += ctx->kv_self.cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+    return ctx->kv_self.used;
 }
 
 void llama_kv_cache_clear(struct llama_context * ctx) {
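A hedged sketch of the new debug view in use, assuming an initialized `llama_context` that has decoded some batches (the sequence cap of 4 is arbitrary):

```cpp
#include <cstdio>
#include "llama.h"

void dump_kv_cache(const llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 4);
    llama_kv_cache_view_update(ctx, &view); // (re)allocates and fills the cell arrays

    printf("cells: %d, tokens: %d, used: %d, longest free run: %d starting at %d\n",
        view.n_cells, view.token_count, view.used_cells,
        view.max_contiguous, view.max_contiguous_idx);

    for (int i = 0; i < view.n_cells; i++) {
        // cells_sequences is a row-major [n_cells][n_max_seq] matrix; -1 pads unused slots
        const llama_seq_id * seqs = view.cells_sequences + (size_t)i * view.n_max_seq;
        if (seqs[0] >= 0) {
            printf("cell %5d: pos %5d, first seq %d\n", i, view.cells[i].pos, seqs[0]);
        }
    }

    llama_kv_cache_view_free(&view);
}
```

Note also the behavior fix hiding in this hunk: `llama_get_kv_cache_token_count` previously reported `head`, which overstates usage once cells are freed; it now sums the actual sequence references, and the cheap O(1) `llama_get_kv_cache_used_cells` exposes the `used` counter directly.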
@@ -8577,16 +9070,18 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const size_t kv_buf_size = kv_self.buf.size;
         const uint32_t kv_head = kv_self.head;
         const uint32_t kv_size = kv_self.size;
+        const uint32_t kv_used = kv_self.used;
 
         data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
         data_ctx->write(&kv_head, sizeof(kv_head));
         data_ctx->write(&kv_size, sizeof(kv_size));
+        data_ctx->write(&kv_used, sizeof(kv_used));
 
         if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
 
-            ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-            ggml_cgraph gf{};
+            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
             ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8604,9 +9099,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
                 kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-            ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+            ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
 
@@ -8703,18 +9198,20 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     size_t kv_buf_size;
     uint32_t kv_head;
     uint32_t kv_size;
+    uint32_t kv_used;
 
     memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
     memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
     memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+    memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
     if (kv_buf_size) {
         GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-        ggml_cgraph gf{};
+        ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+        ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
         kin3d->data = (void *) inp;
@@ -8732,15 +9229,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             kv_head, n_embd, n_layer,
             elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+        ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
 
     ctx->kv_self.head = kv_head;
     ctx->kv_self.size = kv_size;
+    ctx->kv_self.used = kv_used;
 
     ctx->kv_self.cells.resize(kv_size);
 
@@ -8989,6 +9487,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
 
+int llama_add_bos_token(const struct llama_model * model) {
+    return model->vocab.special_add_bos;
+}
+
+int llama_add_eos_token(const struct llama_model * model) {
+    return model->vocab.special_add_eos;
+}
+
 llama_token llama_token_prefix(const struct llama_model * model) {
     return model->vocab.special_prefix_id;
 }