llama_cpp 0.9.2 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
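Beyond the ggml-backend and allocator rework, the llama.cpp sync in this release adds StableLM support and several new C API entry points: GGUF metadata accessors (llama_model_meta_count, llama_model_meta_key_by_index, llama_model_meta_val_str_by_index, llama_model_meta_val_str) and tokenizer flags (llama_add_bos_token, llama_add_eos_token), all visible in the llama.cpp diff below. A minimal usage sketch, not part of this diff, assuming the llama.h loader API of the same era (llama_model_default_params, llama_load_model_from_file):

// Hypothetical sketch: dump a model's GGUF metadata via the C accessors added in this llama.cpp sync.
#include <cstdio>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    llama_backend_init(false /* numa */);

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        return 1;
    }

    // iterate over all GGUF key/value pairs stored with the model
    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        char key[256];
        char val[512];
        llama_model_meta_key_by_index(model, i, key, sizeof(key));
        llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
        printf("%s = %s\n", key, val);
    }

    // -1 = unknown, 0 = do not add, 1 = add (mirrors vocab.special_add_bos/_eos)
    printf("add_bos_token: %d, add_eos_token: %d\n",
           llama_add_bos_token(model), llama_add_eos_token(model));

    llama_free_model(model);
    llama_backend_free();
    return 0;
}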
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,6 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
+#define LLAMA_MAX_NODES 8192
+
 //
 // logging
 //
@@ -190,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
 };
 
 enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
 };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_STABLELM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -577,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
     return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }
 
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+        case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+        case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+        case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+        case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+        case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+        default: return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
@@ -1055,9 +1136,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t
-static const size_t
-static const size_t
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
@@ -1194,6 +1275,7 @@ struct llama_kv_cache {
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
     uint32_t n = 0;
@@ -1248,6 +1330,9 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
     id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
@@ -1292,6 +1377,9 @@ struct llama_model {
 
     int n_gpu_layers;
 
+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     // context
     struct ggml_context * ctx = NULL;
 
@@ -1412,6 +1500,7 @@ static bool llama_kv_cache_init(
 
     cache.head = 0;
     cache.size = n_ctx;
+    cache.used = 0;
 
     cache.cells.clear();
     cache.cells.resize(n_ctx);
@@ -1453,7 +1542,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -1513,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
         }
     }
 
+    cache.used += n_tokens;
+
     return true;
 }
 
@@ -1533,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
         cache.cells[i].seq_id.clear();
     }
     cache.head = 0;
+    cache.used = 0;
 }
 
 static void llama_kv_cache_seq_rm(
@@ -1555,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
                 continue;
             }
             if (cache.cells[i].seq_id.empty()) {
+                // keep count of the number of used cells
+                if (cache.cells[i].pos >= 0) cache.used--;
+
                 cache.cells[i].pos = -1;
                 if (new_head == cache.size) new_head = i;
             }
@@ -1562,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
     }
 
     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_cp(
@@ -1588,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
+            if (cache.cells[i].pos >= 0) cache.used--;
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
             if (new_head == cache.size) new_head = i;
@@ -1598,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     }
 
     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_shift(
@@ -1619,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
             cache.cells[i].delta += delta;
 
             if (cache.cells[i].pos < 0) {
+                if (!cache.cells[i].seq_id.empty()) cache.used--;
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
                 if (new_head == cache.size) new_head = i;
@@ -1750,10 +1847,10 @@ struct llama_model_loader {
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
             default:
-
-
-
-
+                {
+                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                    ftype = LLAMA_FTYPE_ALL_F32;
+                } break;
         }
 
         // this is a way to mark that we have "guessed" the file type
@@ -1767,10 +1864,21 @@ struct llama_model_loader {
         }
 
         for (int i = 0; i < n_kv; i++) {
-            const char * name
-            const enum gguf_type type
+            const char * name = gguf_get_key(ctx_gguf, i);
+            const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                : gguf_type_name(type);
+
+            std::string value = gguf_kv_to_str(ctx_gguf, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
 
-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
 
         // print type counts
@@ -2065,6 +2173,17 @@ static void llm_load_hparams(
 
     auto & hparams = model.hparams;
 
+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
@@ -2209,6 +2328,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+
         default: (void)0;
     }
 
@@ -2350,6 +2479,23 @@ static void llm_load_vocab(
                     __func__, key.c_str(), id, old_id);
                 id = old_id;
             }
+
+        }
+
+        // Handle add_bos_token and add_eos_token
+        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+        int kid = gguf_find_key(ctx, key.c_str());
+        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        }
+        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+        kid = gguf_find_key(ctx, key.c_str());
+        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
         }
     }
 
@@ -2481,8 +2627,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes <
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2520,7 +2666,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -2872,6 +3018,13 @@ static void llm_load_tensors(
                     ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
+#ifdef GGML_USE_CUBLAS
+                        if (n_gpu_layers > int(n_layer + 1)) {
+                            LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+                                __func__, n_layer + 1);
+                            throw std::runtime_error("Persimmon CUDA offload failed");
+                        }
+#endif
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                         // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
@@ -3073,6 +3226,81 @@ static void llm_load_tensors(
                         }
                     }
                 } break;
+            case LLM_ARCH_STABLELM:
+                {
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                    // output
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                            backend_norm = llama_backend_offload;
+#else
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+#endif // _WIN32
+
+                            backend_output = llama_backend_offload_split;
+                        } else {
+                            backend_norm = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                        model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                        model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        /*
+                        llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+                        */
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                        layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                        layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                        layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                        }
+                    }
+                } break;
+
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -3087,7 +3315,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-        LLAMA_LOG_INFO("%s: mem required = %7.2f
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3106,7 +3334,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3606,7 +3834,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -3718,7 +3946,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3838,7 +4066,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3960,7 +4188,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4059,7 +4287,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_persimmon() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_rot = n_embd_head / 2;
 
@@ -4204,7 +4432,7 @@ struct llm_build_context {
                 struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur,
+                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
                 cb(Q, "Q", il);
 
                 Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4269,7 +4497,7 @@ struct llm_build_context {
    }
 
     struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4360,7 +4588,7 @@ struct llm_build_context {
     }
 
    struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4454,7 +4682,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4551,6 +4779,119 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_stablelm() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 //
@@ -5020,6 +5361,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_mpt();
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                result = llm.build_stablelm();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -5129,6 +5474,12 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }
 
+    // if we have enough unused cells before the current head ->
+    //   better to start searching from the beginning of the cache, hoping to fill it
+    if (kv_self.head > kv_self.used + 2*n_tokens) {
+        kv_self.head = 0;
+    }
+
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -5139,7 +5490,7 @@ static int llama_decode_internal(
     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
 
-    //printf("kv_self.n = %
+    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_allocr_reset(lctx.alloc);
 
@@ -5195,7 +5546,8 @@ static int llama_decode_internal(
         model.arch == LLM_ARCH_FALCON ||
        model.arch == LLM_ARCH_REFACT ||
        model.arch == LLM_ARCH_MPT ||
-        model.arch == LLM_ARCH_STARCODER
+        model.arch == LLM_ARCH_STARCODER ||
+        model.arch == LLM_ARCH_STABLELM;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5987,7 +6339,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                     // and passing 'add space prefix' as bool argument
                     //
-                    auto raw_text =
+                    auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                    if (&fragment == &fragment_buffer.front()) {
+                        raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    }
 
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
@@ -7639,7 +7994,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             workers.clear();
         }
 
-        LLAMA_LOG_INFO("size = %8.2f
+        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         int64_t tot_count = 0;
        for (size_t i = 0; i < hist_cur.size(); i++) {
            hist_all[i] += hist_cur[i];
@@ -8179,7 +8534,7 @@ struct llama_context * llama_new_context_with_model(
 
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
         // resized during inference
@@ -8196,7 +8551,7 @@ struct llama_context * llama_new_context_with_model(
         {
             static const size_t tensor_alignment = 32;
             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
-            ctx->buf_compute.resize(ggml_tensor_overhead()*
+            ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
             // create measure allocator
             ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8224,7 +8579,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8238,7 +8593,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8258,10 +8613,10 @@ struct llama_context * llama_new_context_with_model(
             size_t ctx_vram_size = alloc_size + kv_vram_size;
             size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f
+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                 total_vram_size / 1024.0 / 1024.0,
                 model_vram_size / 1024.0 / 1024.0,
-                ctx_vram_size
+                ctx_vram_size / 1024.0 / 1024.0);
 #endif
         }
 
@@ -8282,7 +8637,7 @@ struct llama_context * llama_new_context_with_model(
 
             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-            LLAMA_LOG_INFO("%s: max tensor size = %8.2f
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
             if (!(result)) { \
@@ -8348,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
 
+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
         llama_model_arch_name(model->arch).c_str(),
@@ -8406,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }
 
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+    struct llama_kv_cache_view result = {
+        /*.n_cells = */ 0,
+        /*.n_max_seq = */ n_max_seq,
+        /*.token_count = */ 0,
+        /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
+        /*.max_contiguous = */ 0,
+        /*.max_contiguous_idx = */ -1,
+        /*.cells = */ nullptr,
+        /*.cells_sequences = */ nullptr,
+    };
+    return result;
+}
+
+void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+    if (view->cells != nullptr) {
+        free(view->cells);
+        view->cells = nullptr;
+    }
+    if (view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-
+    int result = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+        result += ctx->kv_self.cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+    return ctx->kv_self.used;
 }
 
 void llama_kv_cache_clear(struct llama_context * ctx) {
@@ -8577,16 +9070,18 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const size_t kv_buf_size = kv_self.buf.size;
         const uint32_t kv_head = kv_self.head;
         const uint32_t kv_size = kv_self.size;
+        const uint32_t kv_used = kv_self.used;
 
         data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
         data_ctx->write(&kv_head, sizeof(kv_head));
         data_ctx->write(&kv_size, sizeof(kv_size));
+        data_ctx->write(&kv_used, sizeof(kv_used));
 
         if (kv_buf_size) {
            const size_t elt_size = ggml_element_size(kv_self.k);
 
-            ggml_context * cpy_ctx = ggml_init({
-            ggml_cgraph gf
+            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
             ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8604,9 +9099,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
                 kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-            ggml_build_forward_expand(
-            ggml_build_forward_expand(
-            ggml_graph_compute_helper(ctx->work_buffer,
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+            ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
 
@@ -8703,18 +9198,20 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         size_t kv_buf_size;
         uint32_t kv_head;
         uint32_t kv_size;
+        uint32_t kv_used;
 
         memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
         memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
         memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
             const size_t elt_size = ggml_element_size(kv_self.k);
 
-            ggml_context * cpy_ctx = ggml_init({
-            ggml_cgraph gf
+            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
             ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             kin3d->data = (void *) inp;
@@ -8732,15 +9229,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
                 kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-            ggml_build_forward_expand(
-            ggml_build_forward_expand(
-            ggml_graph_compute_helper(ctx->work_buffer,
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+            ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
         }
 
         ctx->kv_self.head = kv_head;
         ctx->kv_self.size = kv_size;
+        ctx->kv_self.used = kv_used;
 
         ctx->kv_self.cells.resize(kv_size);
 
@@ -8989,6 +9487,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
 
+int llama_add_bos_token(const struct llama_model * model) {
+    return model->vocab.special_add_bos;
+}
+
+int llama_add_eos_token(const struct llama_model * model) {
+    return model->vocab.special_add_eos;
+}
+
 llama_token llama_token_prefix(const struct llama_model * model) {
     return model->vocab.special_prefix_id;
 }