llama_cpp 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -75,6 +75,7 @@
|
|
75
75
|
#include <thread>
|
76
76
|
#include <unordered_map>
|
77
77
|
#include <set>
|
78
|
+
#include <forward_list>
|
78
79
|
|
79
80
|
#if defined(_MSC_VER)
|
80
81
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
@@ -186,7 +187,9 @@ enum llm_arch {
|
|
186
187
|
LLM_ARCH_GPTNEOX,
|
187
188
|
LLM_ARCH_MPT,
|
188
189
|
LLM_ARCH_STARCODER,
|
190
|
+
LLM_ARCH_PERSIMMON,
|
189
191
|
LLM_ARCH_REFACT,
|
192
|
+
LLM_ARCH_BLOOM,
|
190
193
|
LLM_ARCH_UNKNOWN,
|
191
194
|
};
|
192
195
|
|
@@ -199,7 +202,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
|
199
202
|
{ LLM_ARCH_MPT, "mpt" },
|
200
203
|
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
201
204
|
{ LLM_ARCH_STARCODER, "starcoder" },
|
202
|
-
{
|
205
|
+
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
206
|
+
{ LLM_ARCH_REFACT, "refact" },
|
207
|
+
{ LLM_ARCH_BLOOM, "bloom" },
|
203
208
|
};
|
204
209
|
|
205
210
|
enum llm_kv {
|
@@ -302,6 +307,7 @@ struct LLM_KV {
|
|
302
307
|
|
303
308
|
enum llm_tensor {
|
304
309
|
LLM_TENSOR_TOKEN_EMBD,
|
310
|
+
LLM_TENSOR_TOKEN_EMBD_NORM,
|
305
311
|
LLM_TENSOR_POS_EMBD,
|
306
312
|
LLM_TENSOR_OUTPUT,
|
307
313
|
LLM_TENSOR_OUTPUT_NORM,
|
@@ -318,6 +324,8 @@ enum llm_tensor {
|
|
318
324
|
LLM_TENSOR_FFN_DOWN,
|
319
325
|
LLM_TENSOR_FFN_UP,
|
320
326
|
LLM_TENSOR_FFN_NORM,
|
327
|
+
LLM_TENSOR_ATTN_Q_NORM,
|
328
|
+
LLM_TENSOR_ATTN_K_NORM,
|
321
329
|
};
|
322
330
|
|
323
331
|
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
@@ -399,10 +407,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
399
407
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
400
408
|
},
|
401
409
|
},
|
410
|
+
{
|
411
|
+
LLM_ARCH_PERSIMMON,
|
412
|
+
{
|
413
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd"},
|
414
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm"},
|
415
|
+
{ LLM_TENSOR_OUTPUT, "output"},
|
416
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
|
417
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
|
418
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
|
419
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
|
420
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
|
421
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
|
422
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
|
423
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
|
424
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
|
425
|
+
},
|
426
|
+
},
|
402
427
|
{
|
403
428
|
LLM_ARCH_MPT,
|
404
429
|
{
|
405
430
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
431
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
432
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
433
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
434
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
435
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
436
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
437
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
438
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
406
439
|
},
|
407
440
|
},
|
408
441
|
{
|
@@ -437,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
437
470
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
438
471
|
},
|
439
472
|
},
|
473
|
+
{
|
474
|
+
LLM_ARCH_BLOOM,
|
475
|
+
{
|
476
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
477
|
+
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
478
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
479
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
480
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
481
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
482
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
483
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
484
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
485
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
486
|
+
},
|
487
|
+
},
|
440
488
|
{
|
441
489
|
LLM_ARCH_UNKNOWN,
|
442
490
|
{
|
@@ -954,6 +1002,7 @@ enum e_model {
|
|
954
1002
|
MODEL_1B,
|
955
1003
|
MODEL_3B,
|
956
1004
|
MODEL_7B,
|
1005
|
+
MODEL_8B,
|
957
1006
|
MODEL_13B,
|
958
1007
|
MODEL_15B,
|
959
1008
|
MODEL_30B,
|
@@ -984,6 +1033,9 @@ struct llama_hparams {
|
|
984
1033
|
float rope_freq_base_train;
|
985
1034
|
float rope_freq_scale_train;
|
986
1035
|
|
1036
|
+
float f_clamp_kqv;
|
1037
|
+
float f_max_alibi_bias;
|
1038
|
+
|
987
1039
|
bool operator!=(const llama_hparams & other) const {
|
988
1040
|
if (this->vocab_only != other.vocab_only) return true;
|
989
1041
|
if (this->n_vocab != other.n_vocab) return true;
|
@@ -1036,6 +1088,10 @@ struct llama_layer {
|
|
1036
1088
|
struct ggml_tensor * attn_norm_b;
|
1037
1089
|
struct ggml_tensor * attn_norm_2;
|
1038
1090
|
struct ggml_tensor * attn_norm_2_b;
|
1091
|
+
struct ggml_tensor * attn_q_norm;
|
1092
|
+
struct ggml_tensor * attn_q_norm_b;
|
1093
|
+
struct ggml_tensor * attn_k_norm;
|
1094
|
+
struct ggml_tensor * attn_k_norm_b;
|
1039
1095
|
|
1040
1096
|
// attention
|
1041
1097
|
struct ggml_tensor * wq;
|
@@ -1077,6 +1133,9 @@ struct llama_kv_cell {
|
|
1077
1133
|
struct llama_kv_cache {
|
1078
1134
|
bool has_shift = false;
|
1079
1135
|
|
1136
|
+
// Note: The value of head isn't only used to optimize searching
|
1137
|
+
// for a free KV slot. llama_decode_internal also uses it, so it
|
1138
|
+
// cannot be freely changed after a slot has been allocated.
|
1080
1139
|
uint32_t head = 0;
|
1081
1140
|
uint32_t size = 0;
|
1082
1141
|
|
@@ -1120,6 +1179,8 @@ struct llama_vocab {
|
|
1120
1179
|
std::unordered_map<token, id> token_to_id;
|
1121
1180
|
std::vector<token_data> id_to_token;
|
1122
1181
|
|
1182
|
+
std::unordered_map<token, id> special_tokens_cache;
|
1183
|
+
|
1123
1184
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
1124
1185
|
|
1125
1186
|
// default LLaMA special tokens
|
@@ -1162,6 +1223,8 @@ struct llama_model {
|
|
1162
1223
|
|
1163
1224
|
struct ggml_tensor * tok_embeddings;
|
1164
1225
|
struct ggml_tensor * pos_embeddings;
|
1226
|
+
struct ggml_tensor * tok_norm;
|
1227
|
+
struct ggml_tensor * tok_norm_b;
|
1165
1228
|
|
1166
1229
|
struct ggml_tensor * output_norm;
|
1167
1230
|
struct ggml_tensor * output_norm_b;
|
@@ -1291,7 +1354,11 @@ static bool llama_kv_cache_init(
|
|
1291
1354
|
cache.cells.clear();
|
1292
1355
|
cache.cells.resize(n_ctx);
|
1293
1356
|
|
1357
|
+
// TODO: this should be:
|
1358
|
+
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
|
1359
|
+
// change it and test that it works
|
1294
1360
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
1361
|
+
memset(cache.buf.data, 0, cache.buf.size);
|
1295
1362
|
|
1296
1363
|
struct ggml_init_params params;
|
1297
1364
|
params.mem_size = cache.buf.size;
|
@@ -1334,6 +1401,8 @@ static bool llama_kv_cache_init(
|
|
1334
1401
|
|
1335
1402
|
// find an empty slot of size "n_tokens" in the cache
|
1336
1403
|
// updates the cache head
|
1404
|
+
// Note: On success, it's important that cache.head points
|
1405
|
+
// to the first cell of the slot.
|
1337
1406
|
static bool llama_kv_cache_find_slot(
|
1338
1407
|
struct llama_kv_cache & cache,
|
1339
1408
|
const struct llama_batch & batch) {
|
@@ -1349,8 +1418,8 @@ static bool llama_kv_cache_find_slot(
|
|
1349
1418
|
|
1350
1419
|
while (true) {
|
1351
1420
|
if (cache.head + n_tokens > n_ctx) {
|
1421
|
+
n_tested += n_ctx - cache.head;
|
1352
1422
|
cache.head = 0;
|
1353
|
-
n_tested += n_ctx - cache.head;
|
1354
1423
|
continue;
|
1355
1424
|
}
|
1356
1425
|
|
@@ -1376,7 +1445,10 @@ static bool llama_kv_cache_find_slot(
|
|
1376
1445
|
|
1377
1446
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
1378
1447
|
cache.cells[cache.head + i].pos = batch.pos[i];
|
1379
|
-
|
1448
|
+
|
1449
|
+
for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
|
1450
|
+
cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
|
1451
|
+
}
|
1380
1452
|
}
|
1381
1453
|
|
1382
1454
|
return true;
|
@@ -1401,6 +1473,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
|
|
1401
1473
|
cache.cells[i].pos = -1;
|
1402
1474
|
cache.cells[i].seq_id.clear();
|
1403
1475
|
}
|
1476
|
+
|
1477
|
+
// Searching for a free slot can start here since we know it will be empty.
|
1478
|
+
cache.head = uint32_t(c0);
|
1404
1479
|
}
|
1405
1480
|
|
1406
1481
|
static void llama_kv_cache_seq_rm(
|
@@ -1408,6 +1483,8 @@ static void llama_kv_cache_seq_rm(
|
|
1408
1483
|
llama_seq_id seq_id,
|
1409
1484
|
llama_pos p0,
|
1410
1485
|
llama_pos p1) {
|
1486
|
+
uint32_t new_head = cache.size;
|
1487
|
+
|
1411
1488
|
if (p0 < 0) p0 = 0;
|
1412
1489
|
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
1413
1490
|
|
@@ -1416,9 +1493,13 @@ static void llama_kv_cache_seq_rm(
|
|
1416
1493
|
cache.cells[i].seq_id.erase(seq_id);
|
1417
1494
|
if (cache.cells[i].seq_id.empty()) {
|
1418
1495
|
cache.cells[i].pos = -1;
|
1496
|
+
if (new_head == cache.size) new_head = i;
|
1419
1497
|
}
|
1420
1498
|
}
|
1421
1499
|
}
|
1500
|
+
|
1501
|
+
// If we freed up a slot, set head to it so searching can start there.
|
1502
|
+
if (new_head != cache.size) cache.head = new_head;
|
1422
1503
|
}
|
1423
1504
|
|
1424
1505
|
static void llama_kv_cache_seq_cp(
|
@@ -1430,6 +1511,8 @@ static void llama_kv_cache_seq_cp(
|
|
1430
1511
|
if (p0 < 0) p0 = 0;
|
1431
1512
|
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
1432
1513
|
|
1514
|
+
cache.head = 0;
|
1515
|
+
|
1433
1516
|
for (uint32_t i = 0; i < cache.size; ++i) {
|
1434
1517
|
if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
1435
1518
|
cache.cells[i].seq_id.insert(seq_id_dst);
|
@@ -1438,12 +1521,21 @@ static void llama_kv_cache_seq_cp(
|
|
1438
1521
|
}
|
1439
1522
|
|
1440
1523
|
static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
|
1524
|
+
uint32_t new_head = cache.size;
|
1525
|
+
|
1441
1526
|
for (uint32_t i = 0; i < cache.size; ++i) {
|
1442
1527
|
if (!cache.cells[i].has_seq_id(seq_id)) {
|
1443
1528
|
cache.cells[i].pos = -1;
|
1444
1529
|
cache.cells[i].seq_id.clear();
|
1530
|
+
if (new_head == cache.size) new_head = i;
|
1531
|
+
} else {
|
1532
|
+
cache.cells[i].seq_id.clear();
|
1533
|
+
cache.cells[i].seq_id.insert(seq_id);
|
1445
1534
|
}
|
1446
1535
|
}
|
1536
|
+
|
1537
|
+
// If we freed up a slot, set head to it so searching can start there.
|
1538
|
+
if (new_head != cache.size) cache.head = new_head;
|
1447
1539
|
}
|
1448
1540
|
|
1449
1541
|
static void llama_kv_cache_seq_shift(
|
@@ -1452,6 +1544,8 @@ static void llama_kv_cache_seq_shift(
|
|
1452
1544
|
llama_pos p0,
|
1453
1545
|
llama_pos p1,
|
1454
1546
|
llama_pos delta) {
|
1547
|
+
uint32_t new_head = cache.size;
|
1548
|
+
|
1455
1549
|
if (p0 < 0) p0 = 0;
|
1456
1550
|
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
1457
1551
|
|
@@ -1461,12 +1555,17 @@ static void llama_kv_cache_seq_shift(
|
|
1461
1555
|
if (cache.cells[i].pos < 0) {
|
1462
1556
|
cache.cells[i].pos = -1;
|
1463
1557
|
cache.cells[i].seq_id.clear();
|
1558
|
+
if (new_head == cache.size) new_head = i;
|
1464
1559
|
} else {
|
1465
1560
|
cache.has_shift = true;
|
1466
1561
|
cache.cells[i].delta = delta;
|
1467
1562
|
}
|
1468
1563
|
}
|
1469
1564
|
}
|
1565
|
+
|
1566
|
+
// If we freed up a slot, set head to it so searching can start there.
|
1567
|
+
// Otherwise we just start the next search from the beginning.
|
1568
|
+
cache.head = new_head != cache.size ? new_head : 0;
|
1470
1569
|
}
|
1471
1570
|
|
1472
1571
|
//
|
@@ -1670,7 +1769,7 @@ struct llama_model_loader {
|
|
1670
1769
|
}
|
1671
1770
|
}
|
1672
1771
|
|
1673
|
-
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta,
|
1772
|
+
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
|
1674
1773
|
if (backend != GGML_BACKEND_CPU) {
|
1675
1774
|
ggml_set_no_alloc(ctx, true);
|
1676
1775
|
}
|
@@ -1688,7 +1787,7 @@ struct llama_model_loader {
|
|
1688
1787
|
return tensor;
|
1689
1788
|
}
|
1690
1789
|
|
1691
|
-
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
|
1790
|
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
|
1692
1791
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
1693
1792
|
|
1694
1793
|
if (cur == NULL) {
|
@@ -1867,6 +1966,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
1867
1966
|
case MODEL_1B: return "1B";
|
1868
1967
|
case MODEL_3B: return "3B";
|
1869
1968
|
case MODEL_7B: return "7B";
|
1969
|
+
case MODEL_8B: return "8B";
|
1870
1970
|
case MODEL_13B: return "13B";
|
1871
1971
|
case MODEL_15B: return "15B";
|
1872
1972
|
case MODEL_30B: return "30B";
|
@@ -1979,6 +2079,14 @@ static void llm_load_hparams(
|
|
1979
2079
|
default: model.type = e_model::MODEL_UNKNOWN;
|
1980
2080
|
}
|
1981
2081
|
} break;
|
2082
|
+
case LLM_ARCH_PERSIMMON:
|
2083
|
+
{
|
2084
|
+
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
2085
|
+
switch (hparams.n_layer) {
|
2086
|
+
case 36: model.type = e_model::MODEL_8B; break;
|
2087
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2088
|
+
}
|
2089
|
+
} break;
|
1982
2090
|
case LLM_ARCH_REFACT:
|
1983
2091
|
{
|
1984
2092
|
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
@@ -1987,6 +2095,33 @@ static void llm_load_hparams(
|
|
1987
2095
|
default: model.type = e_model::MODEL_UNKNOWN;
|
1988
2096
|
}
|
1989
2097
|
} break;
|
2098
|
+
case LLM_ARCH_BLOOM:
|
2099
|
+
{
|
2100
|
+
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
2101
|
+
|
2102
|
+
switch (hparams.n_layer) {
|
2103
|
+
case 24: model.type = e_model::MODEL_1B; break;
|
2104
|
+
case 30:
|
2105
|
+
switch (hparams.n_embd) {
|
2106
|
+
case 2560: model.type = e_model::MODEL_3B; break;
|
2107
|
+
case 4096: model.type = e_model::MODEL_7B; break;
|
2108
|
+
} break;
|
2109
|
+
}
|
2110
|
+
} break;
|
2111
|
+
case LLM_ARCH_MPT:
|
2112
|
+
{
|
2113
|
+
hparams.f_clamp_kqv = 0.0f;
|
2114
|
+
|
2115
|
+
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
2116
|
+
GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
|
2117
|
+
GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
|
2118
|
+
|
2119
|
+
switch (hparams.n_layer) {
|
2120
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
2121
|
+
case 48: model.type = e_model::MODEL_30B; break;
|
2122
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2123
|
+
}
|
2124
|
+
} break;
|
1990
2125
|
default: (void)0;
|
1991
2126
|
}
|
1992
2127
|
|
@@ -1994,7 +2129,7 @@ static void llm_load_hparams(
|
|
1994
2129
|
}
|
1995
2130
|
|
1996
2131
|
// TODO: This should probably be in llama.h
|
1997
|
-
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
|
2132
|
+
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
|
1998
2133
|
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
|
1999
2134
|
|
2000
2135
|
static void llm_load_vocab(
|
@@ -2110,6 +2245,101 @@ static void llm_load_vocab(
|
|
2110
2245
|
GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
|
2111
2246
|
GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
|
2112
2247
|
GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
|
2248
|
+
|
2249
|
+
// build special tokens cache
|
2250
|
+
{
|
2251
|
+
// TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
|
2252
|
+
// and will always be correctly labeled in 'added_tokens.json' etc.
|
2253
|
+
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
|
2254
|
+
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
|
2255
|
+
// are special tokens.
|
2256
|
+
// From testing, this appears to corelate 1:1 with special tokens.
|
2257
|
+
//
|
2258
|
+
|
2259
|
+
// Counting special tokens and verifying in only one direction
|
2260
|
+
// is sufficient to detect difference in those two sets.
|
2261
|
+
//
|
2262
|
+
uint32_t special_tokens_count_by_type = 0;
|
2263
|
+
uint32_t special_tokens_count_from_verification = 0;
|
2264
|
+
|
2265
|
+
bool special_tokens_definition_mismatch = false;
|
2266
|
+
|
2267
|
+
for (const auto & t : vocab.token_to_id) {
|
2268
|
+
const auto & token = t.first;
|
2269
|
+
const auto & id = t.second;
|
2270
|
+
|
2271
|
+
// Count all non-normal tokens in the vocab while iterating
|
2272
|
+
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
2273
|
+
special_tokens_count_by_type++;
|
2274
|
+
}
|
2275
|
+
|
2276
|
+
// Skip single character tokens
|
2277
|
+
if (token.length() > 1) {
|
2278
|
+
bool is_tokenizable = false;
|
2279
|
+
|
2280
|
+
// Split token string representation in two, in all possible ways
|
2281
|
+
// and check if both halves can be matched to a valid token
|
2282
|
+
for (unsigned i = 1; i < token.length();) {
|
2283
|
+
const auto left = token.substr(0, i);
|
2284
|
+
const auto right = token.substr(i);
|
2285
|
+
|
2286
|
+
// check if we didnt partition in the middle of a utf sequence
|
2287
|
+
auto utf = utf8_len(left.at(left.length() - 1));
|
2288
|
+
|
2289
|
+
if (utf == 1) {
|
2290
|
+
if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
|
2291
|
+
vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
|
2292
|
+
is_tokenizable = true;
|
2293
|
+
break;
|
2294
|
+
}
|
2295
|
+
i++;
|
2296
|
+
} else {
|
2297
|
+
// skip over the rest of multibyte utf sequence
|
2298
|
+
i += utf - 1;
|
2299
|
+
}
|
2300
|
+
}
|
2301
|
+
|
2302
|
+
if (!is_tokenizable) {
|
2303
|
+
// Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
|
2304
|
+
// it's faster to re-filter them here, since there are way less candidates now
|
2305
|
+
|
2306
|
+
// Calculate a total "utf" length of a token string representation
|
2307
|
+
size_t utf8_str_len = 0;
|
2308
|
+
for (unsigned i = 0; i < token.length();) {
|
2309
|
+
utf8_str_len++;
|
2310
|
+
i += utf8_len(token.at(i));
|
2311
|
+
}
|
2312
|
+
|
2313
|
+
// And skip the ones which are one character
|
2314
|
+
if (utf8_str_len > 1) {
|
2315
|
+
// At this point what we have left are special tokens only
|
2316
|
+
vocab.special_tokens_cache[token] = id;
|
2317
|
+
|
2318
|
+
// Count manually found special tokens
|
2319
|
+
special_tokens_count_from_verification++;
|
2320
|
+
|
2321
|
+
// If this manually found special token is not marked as such, flag a mismatch
|
2322
|
+
if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
|
2323
|
+
special_tokens_definition_mismatch = true;
|
2324
|
+
}
|
2325
|
+
}
|
2326
|
+
}
|
2327
|
+
}
|
2328
|
+
}
|
2329
|
+
|
2330
|
+
if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
|
2331
|
+
LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
|
2332
|
+
__func__,
|
2333
|
+
special_tokens_count_from_verification, vocab.id_to_token.size(),
|
2334
|
+
special_tokens_count_by_type, vocab.id_to_token.size()
|
2335
|
+
);
|
2336
|
+
} else {
|
2337
|
+
LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
|
2338
|
+
__func__,
|
2339
|
+
special_tokens_count_from_verification, vocab.id_to_token.size()
|
2340
|
+
);
|
2341
|
+
}
|
2342
|
+
}
|
2113
2343
|
}
|
2114
2344
|
|
2115
2345
|
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
@@ -2131,6 +2361,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
2131
2361
|
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
2132
2362
|
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
|
2133
2363
|
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
2364
|
+
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
2365
|
+
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
2134
2366
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
2135
2367
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
2136
2368
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
@@ -2230,8 +2462,8 @@ static void llm_load_tensors(
|
|
2230
2462
|
|
2231
2463
|
// output
|
2232
2464
|
{
|
2233
|
-
|
2234
|
-
|
2465
|
+
ggml_backend_type backend_norm;
|
2466
|
+
ggml_backend_type backend_output;
|
2235
2467
|
|
2236
2468
|
if (n_gpu_layers > int(n_layer)) {
|
2237
2469
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
@@ -2266,8 +2498,8 @@ static void llm_load_tensors(
|
|
2266
2498
|
model.layers.resize(n_layer);
|
2267
2499
|
|
2268
2500
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
2269
|
-
const
|
2270
|
-
const
|
2501
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2502
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2271
2503
|
|
2272
2504
|
auto & layer = model.layers[i];
|
2273
2505
|
|
@@ -2296,8 +2528,8 @@ static void llm_load_tensors(
|
|
2296
2528
|
{
|
2297
2529
|
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2298
2530
|
{
|
2299
|
-
|
2300
|
-
|
2531
|
+
ggml_backend_type backend_norm;
|
2532
|
+
ggml_backend_type backend_output;
|
2301
2533
|
|
2302
2534
|
if (n_gpu_layers > int(n_layer)) {
|
2303
2535
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
@@ -2332,8 +2564,8 @@ static void llm_load_tensors(
|
|
2332
2564
|
model.layers.resize(n_layer);
|
2333
2565
|
|
2334
2566
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
2335
|
-
const
|
2336
|
-
const
|
2567
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2568
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2337
2569
|
|
2338
2570
|
auto & layer = model.layers[i];
|
2339
2571
|
|
@@ -2366,8 +2598,8 @@ static void llm_load_tensors(
|
|
2366
2598
|
|
2367
2599
|
// output
|
2368
2600
|
{
|
2369
|
-
|
2370
|
-
|
2601
|
+
ggml_backend_type backend_norm;
|
2602
|
+
ggml_backend_type backend_output;
|
2371
2603
|
|
2372
2604
|
if (n_gpu_layers > int(n_layer)) {
|
2373
2605
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
@@ -2404,8 +2636,8 @@ static void llm_load_tensors(
|
|
2404
2636
|
model.layers.resize(n_layer);
|
2405
2637
|
|
2406
2638
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
2407
|
-
const
|
2408
|
-
const
|
2639
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2640
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2409
2641
|
|
2410
2642
|
auto & layer = model.layers[i];
|
2411
2643
|
|
@@ -2443,8 +2675,8 @@ static void llm_load_tensors(
|
|
2443
2675
|
|
2444
2676
|
// output
|
2445
2677
|
{
|
2446
|
-
|
2447
|
-
|
2678
|
+
ggml_backend_type backend_norm;
|
2679
|
+
ggml_backend_type backend_output;
|
2448
2680
|
|
2449
2681
|
if (n_gpu_layers > int(n_layer)) {
|
2450
2682
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
@@ -2481,8 +2713,8 @@ static void llm_load_tensors(
|
|
2481
2713
|
model.layers.resize(n_layer);
|
2482
2714
|
|
2483
2715
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
2484
|
-
const
|
2485
|
-
const
|
2716
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2717
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2486
2718
|
|
2487
2719
|
auto & layer = model.layers[i];
|
2488
2720
|
|
@@ -2515,117 +2747,327 @@ static void llm_load_tensors(
|
|
2515
2747
|
}
|
2516
2748
|
}
|
2517
2749
|
} break;
|
2518
|
-
|
2519
|
-
|
2520
|
-
|
2521
|
-
}
|
2750
|
+
case LLM_ARCH_PERSIMMON:
|
2751
|
+
{
|
2752
|
+
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2522
2753
|
|
2523
|
-
|
2754
|
+
{
|
2755
|
+
ggml_backend_type backend_norm;
|
2756
|
+
ggml_backend_type backend_output;
|
2524
2757
|
|
2525
|
-
|
2526
|
-
|
2527
|
-
|
2528
|
-
|
2529
|
-
|
2530
|
-
|
2758
|
+
if (n_gpu_layers > int(n_layer)) {
|
2759
|
+
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2760
|
+
// on Windows however this is detrimental unless everything is on the GPU
|
2761
|
+
#ifndef _WIN32
|
2762
|
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2763
|
+
#else
|
2764
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2765
|
+
#endif // _WIN32
|
2531
2766
|
|
2532
|
-
|
2767
|
+
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
2768
|
+
} else {
|
2769
|
+
backend_norm = GGML_BACKEND_CPU;
|
2770
|
+
backend_output = GGML_BACKEND_CPU;
|
2771
|
+
}
|
2533
2772
|
|
2534
|
-
|
2535
|
-
|
2773
|
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
2774
|
+
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
2775
|
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
2536
2776
|
|
2537
|
-
|
2538
|
-
|
2539
|
-
|
2540
|
-
|
2777
|
+
if (backend_norm == GGML_BACKEND_GPU) {
|
2778
|
+
vram_weights += ggml_nbytes(model.output_norm);
|
2779
|
+
vram_weights += ggml_nbytes(model.output_norm_b);
|
2780
|
+
}
|
2781
|
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
2782
|
+
vram_weights += ggml_nbytes(model.output);
|
2783
|
+
}
|
2784
|
+
}
|
2541
2785
|
|
2542
|
-
|
2543
|
-
|
2544
|
-
|
2545
|
-
|
2546
|
-
|
2547
|
-
|
2548
|
-
|
2786
|
+
const uint32_t n_ff = hparams.n_ff;
|
2787
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
2788
|
+
model.layers.resize(n_layer);
|
2789
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
2790
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2791
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
|
2792
|
+
auto & layer = model.layers[i];
|
2793
|
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
2794
|
+
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
2795
|
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
2796
|
+
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
2797
|
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
2798
|
+
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
2799
|
+
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
2800
|
+
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
2801
|
+
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
2802
|
+
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
2803
|
+
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
2804
|
+
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
2805
|
+
layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
|
2806
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
|
2807
|
+
layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
|
2808
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
|
2809
|
+
}
|
2810
|
+
} break;
|
2811
|
+
case LLM_ARCH_BLOOM:
|
2812
|
+
{
|
2813
|
+
// TODO: CPU-only for now
|
2549
2814
|
|
2550
|
-
|
2551
|
-
|
2552
|
-
|
2553
|
-
(void) n_gpu_layers;
|
2554
|
-
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
2555
|
-
}
|
2815
|
+
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2816
|
+
model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
|
2817
|
+
model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
|
2556
2818
|
|
2557
|
-
|
2558
|
-
|
2559
|
-
|
2560
|
-
|
2561
|
-
}
|
2819
|
+
// output
|
2820
|
+
{
|
2821
|
+
ggml_backend_type backend_norm;
|
2822
|
+
ggml_backend_type backend_output;
|
2562
2823
|
|
2563
|
-
|
2564
|
-
|
2565
|
-
|
2566
|
-
|
2567
|
-
|
2568
|
-
#
|
2824
|
+
if (n_gpu_layers > int(n_layer)) {
|
2825
|
+
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2826
|
+
// on Windows however this is detrimental unless everything is on the GPU
|
2827
|
+
#ifndef _WIN32
|
2828
|
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2829
|
+
#else
|
2830
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2831
|
+
#endif // _WIN32
|
2569
2832
|
|
2570
|
-
|
2833
|
+
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
2834
|
+
} else {
|
2835
|
+
backend_norm = GGML_BACKEND_CPU;
|
2836
|
+
backend_output = GGML_BACKEND_CPU;
|
2837
|
+
}
|
2571
2838
|
|
2572
|
-
|
2573
|
-
|
2574
|
-
|
2839
|
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
2840
|
+
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
2841
|
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
2575
2842
|
|
2576
|
-
|
2843
|
+
if (backend_norm == GGML_BACKEND_GPU) {
|
2844
|
+
vram_weights += ggml_nbytes(model.output_norm);
|
2845
|
+
vram_weights += ggml_nbytes(model.output_norm_b);
|
2846
|
+
}
|
2847
|
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
2848
|
+
vram_weights += ggml_nbytes(model.output);
|
2849
|
+
}
|
2850
|
+
}
|
2577
2851
|
|
2578
|
-
|
2579
|
-
// we take page faults deferred by mmap() into consideration
|
2580
|
-
model.t_load_us = ggml_time_us() - model.t_start_us;
|
2581
|
-
}
|
2852
|
+
const uint32_t n_ff = hparams.n_ff;
|
2582
2853
|
|
2583
|
-
|
2584
|
-
const std::string & fname,
|
2585
|
-
llama_model & model,
|
2586
|
-
int n_gpu_layers,
|
2587
|
-
int main_gpu,
|
2588
|
-
const float * tensor_split,
|
2589
|
-
bool use_mmap,
|
2590
|
-
bool use_mlock,
|
2591
|
-
bool vocab_only,
|
2592
|
-
llama_progress_callback progress_callback,
|
2593
|
-
void *progress_callback_user_data) {
|
2594
|
-
try {
|
2595
|
-
llama_model_loader ml(fname, use_mmap);
|
2854
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
2596
2855
|
|
2597
|
-
|
2856
|
+
model.layers.resize(n_layer);
|
2598
2857
|
|
2599
|
-
|
2600
|
-
|
2601
|
-
|
2858
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
2859
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2860
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2602
2861
|
|
2603
|
-
|
2862
|
+
auto & layer = model.layers[i];
|
2604
2863
|
|
2605
|
-
|
2606
|
-
|
2607
|
-
}
|
2864
|
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
2865
|
+
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
2608
2866
|
|
2609
|
-
|
2610
|
-
|
2611
|
-
return true;
|
2612
|
-
}
|
2867
|
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
2868
|
+
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
2613
2869
|
|
2614
|
-
|
2615
|
-
|
2616
|
-
main_gpu, tensor_split,
|
2617
|
-
use_mlock, progress_callback, progress_callback_user_data);
|
2618
|
-
} catch (const std::exception & err) {
|
2619
|
-
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
2620
|
-
return false;
|
2621
|
-
}
|
2870
|
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
2871
|
+
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
2622
2872
|
|
2623
|
-
|
2624
|
-
}
|
2873
|
+
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
2874
|
+
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
2875
|
+
|
2876
|
+
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
2877
|
+
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
2878
|
+
|
2879
|
+
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
2880
|
+
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
2881
|
+
|
2882
|
+
if (backend == GGML_BACKEND_GPU) {
|
2883
|
+
vram_weights +=
|
2884
|
+
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
2885
|
+
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
2886
|
+
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
|
2887
|
+
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
|
2888
|
+
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
|
2889
|
+
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
|
2890
|
+
}
|
2891
|
+
}
|
2892
|
+
} break;
|
2893
|
+
case LLM_ARCH_MPT:
|
2894
|
+
{
|
2895
|
+
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2896
|
+
|
2897
|
+
// output
|
2898
|
+
{
|
2899
|
+
ggml_backend_type backend_norm;
|
2900
|
+
ggml_backend_type backend_output;
|
2901
|
+
|
2902
|
+
if (n_gpu_layers > int(n_layer)) {
|
2903
|
+
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2904
|
+
// on Windows however this is detrimental unless everything is on the GPU
|
2905
|
+
#ifndef _WIN32
|
2906
|
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2907
|
+
#else
|
2908
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2909
|
+
#endif // _WIN32
|
2910
|
+
|
2911
|
+
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
2912
|
+
} else {
|
2913
|
+
backend_norm = GGML_BACKEND_CPU;
|
2914
|
+
backend_output = GGML_BACKEND_CPU;
|
2915
|
+
}
|
2916
|
+
|
2917
|
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
2918
|
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
2919
|
+
|
2920
|
+
if (backend_norm == GGML_BACKEND_GPU) {
|
2921
|
+
vram_weights += ggml_nbytes(model.output_norm);
|
2922
|
+
}
|
2923
|
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
2924
|
+
vram_weights += ggml_nbytes(model.output);
|
2925
|
+
}
|
2926
|
+
}
|
2927
|
+
|
2928
|
+
const uint32_t n_ff = hparams.n_ff;
|
2929
|
+
|
2930
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
2931
|
+
|
2932
|
+
model.layers.resize(n_layer);
|
2933
|
+
|
2934
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
2935
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2936
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2937
|
+
|
2938
|
+
auto & layer = model.layers[i];
|
2939
|
+
|
2940
|
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
2941
|
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
2942
|
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
2943
|
+
|
2944
|
+
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
2945
|
+
|
2946
|
+
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
2947
|
+
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
2948
|
+
|
2949
|
+
if (backend == GGML_BACKEND_GPU) {
|
2950
|
+
vram_weights +=
|
2951
|
+
ggml_nbytes(layer.attn_norm) +
|
2952
|
+
ggml_nbytes(layer.wqkv) +
|
2953
|
+
ggml_nbytes(layer.wo) +
|
2954
|
+
ggml_nbytes(layer.ffn_norm) +
|
2955
|
+
ggml_nbytes(layer.w2) +
|
2956
|
+
ggml_nbytes(layer.w3);
|
2957
|
+
}
|
2958
|
+
}
|
2959
|
+
} break;
|
2960
|
+
default:
|
2961
|
+
throw std::runtime_error("unknown architecture");
|
2962
|
+
}
|
2963
|
+
}
|
2964
|
+
|
2965
|
+
ml.done_getting_tensors();
|
2966
|
+
|
2967
|
+
// print memory requirements
|
2968
|
+
{
|
2969
|
+
// this is the total memory required to run the inference
|
2970
|
+
size_t mem_required =
|
2971
|
+
ctx_size +
|
2972
|
+
mmapped_size - vram_weights; // weights in VRAM not in memory
|
2973
|
+
|
2974
|
+
LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
|
2975
|
+
|
2976
|
+
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
2977
|
+
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
2978
|
+
|
2979
|
+
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
2980
|
+
if (n_gpu_layers > (int) hparams.n_layer) {
|
2981
|
+
LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
|
2982
|
+
}
|
2983
|
+
|
2984
|
+
#ifdef GGML_USE_CUBLAS
|
2985
|
+
const int max_backend_supported_layers = hparams.n_layer + 3;
|
2986
|
+
const int max_offloadable_layers = hparams.n_layer + 3;
|
2987
|
+
#elif defined(GGML_USE_CLBLAST)
|
2988
|
+
const int max_backend_supported_layers = hparams.n_layer + 1;
|
2989
|
+
const int max_offloadable_layers = hparams.n_layer + 1;
|
2990
|
+
#endif // GGML_USE_CUBLAS
|
2991
|
+
|
2992
|
+
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
2993
|
+
LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
2994
|
+
#else
|
2995
|
+
(void) n_gpu_layers;
|
2996
|
+
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
2997
|
+
}
|
2998
|
+
|
2999
|
+
// populate `tensors_by_name`
|
3000
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
3001
|
+
struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
|
3002
|
+
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
|
3003
|
+
}
|
3004
|
+
|
3005
|
+
(void) tensor_split;
|
3006
|
+
#ifdef GGML_USE_CUBLAS
|
3007
|
+
{
|
3008
|
+
ggml_cuda_set_tensor_split(tensor_split);
|
3009
|
+
}
|
3010
|
+
#endif
|
3011
|
+
|
3012
|
+
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
|
3013
|
+
|
3014
|
+
if (progress_callback) {
|
3015
|
+
progress_callback(1.0f, progress_callback_user_data);
|
3016
|
+
}
|
3017
|
+
|
3018
|
+
model.mapping = std::move(ml.mapping);
|
3019
|
+
|
3020
|
+
// loading time will be recalculate after the first eval, so
|
3021
|
+
// we take page faults deferred by mmap() into consideration
|
3022
|
+
model.t_load_us = ggml_time_us() - model.t_start_us;
|
3023
|
+
}
|
3024
|
+
|
3025
|
+
static bool llama_model_load(
|
3026
|
+
const std::string & fname,
|
3027
|
+
llama_model & model,
|
3028
|
+
int n_gpu_layers,
|
3029
|
+
int main_gpu,
|
3030
|
+
const float * tensor_split,
|
3031
|
+
bool use_mmap,
|
3032
|
+
bool use_mlock,
|
3033
|
+
bool vocab_only,
|
3034
|
+
llama_progress_callback progress_callback,
|
3035
|
+
void *progress_callback_user_data) {
|
3036
|
+
try {
|
3037
|
+
llama_model_loader ml(fname, use_mmap);
|
3038
|
+
|
3039
|
+
model.hparams.vocab_only = vocab_only;
|
3040
|
+
|
3041
|
+
llm_load_arch (ml, model);
|
3042
|
+
llm_load_hparams(ml, model);
|
3043
|
+
llm_load_vocab (ml, model);
|
3044
|
+
|
3045
|
+
llm_load_print_meta(ml, model);
|
3046
|
+
|
3047
|
+
if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
3048
|
+
throw std::runtime_error("vocab size mismatch");
|
3049
|
+
}
|
3050
|
+
|
3051
|
+
if (vocab_only) {
|
3052
|
+
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
|
3053
|
+
return true;
|
3054
|
+
}
|
3055
|
+
|
3056
|
+
llm_load_tensors(
|
3057
|
+
ml, model, n_gpu_layers,
|
3058
|
+
main_gpu, tensor_split,
|
3059
|
+
use_mlock, progress_callback, progress_callback_user_data);
|
3060
|
+
} catch (const std::exception & err) {
|
3061
|
+
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
3062
|
+
return false;
|
3063
|
+
}
|
3064
|
+
|
3065
|
+
return true;
|
3066
|
+
}
|
2625
3067
|
|
2626
3068
|
static struct ggml_cgraph * llm_build_llama(
|
2627
|
-
|
2628
|
-
|
3069
|
+
llama_context & lctx,
|
3070
|
+
const llama_batch & batch) {
|
2629
3071
|
const auto & model = lctx.model;
|
2630
3072
|
const auto & hparams = model.hparams;
|
2631
3073
|
const auto & cparams = lctx.cparams;
|
@@ -2663,11 +3105,9 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2663
3105
|
struct ggml_init_params params = {
|
2664
3106
|
/*.mem_size =*/ buf_compute.size,
|
2665
3107
|
/*.mem_buffer =*/ buf_compute.data,
|
2666
|
-
/*.no_alloc =*/
|
3108
|
+
/*.no_alloc =*/ true,
|
2667
3109
|
};
|
2668
3110
|
|
2669
|
-
params.no_alloc = true;
|
2670
|
-
|
2671
3111
|
struct ggml_context * ctx0 = ggml_init(params);
|
2672
3112
|
|
2673
3113
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
@@ -2739,7 +3179,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2739
3179
|
for (int h = 0; h < 1; ++h) {
|
2740
3180
|
for (int j = 0; j < n_tokens; ++j) {
|
2741
3181
|
const llama_pos pos = batch.pos[j];
|
2742
|
-
const llama_seq_id seq_id = batch.seq_id[j];
|
3182
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
2743
3183
|
|
2744
3184
|
for (int i = 0; i < n_kv; ++i) {
|
2745
3185
|
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
@@ -3051,11 +3491,9 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
3051
3491
|
struct ggml_init_params params = {
|
3052
3492
|
/*.mem_size =*/ buf_compute.size,
|
3053
3493
|
/*.mem_buffer =*/ buf_compute.data,
|
3054
|
-
/*.no_alloc =*/
|
3494
|
+
/*.no_alloc =*/ true,
|
3055
3495
|
};
|
3056
3496
|
|
3057
|
-
params.no_alloc = true;
|
3058
|
-
|
3059
3497
|
struct ggml_context * ctx0 = ggml_init(params);
|
3060
3498
|
|
3061
3499
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
@@ -3127,7 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
3127
3565
|
for (int h = 0; h < 1; ++h) {
|
3128
3566
|
for (int j = 0; j < n_tokens; ++j) {
|
3129
3567
|
const llama_pos pos = batch.pos[j];
|
3130
|
-
const llama_seq_id seq_id = batch.seq_id[j];
|
3568
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
3131
3569
|
|
3132
3570
|
for (int i = 0; i < n_kv; ++i) {
|
3133
3571
|
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
@@ -3452,11 +3890,9 @@ static struct ggml_cgraph * llm_build_refact(
|
|
3452
3890
|
struct ggml_init_params params = {
|
3453
3891
|
/*.mem_size =*/ buf_compute.size,
|
3454
3892
|
/*.mem_buffer =*/ buf_compute.data,
|
3455
|
-
/*.no_alloc =*/
|
3893
|
+
/*.no_alloc =*/ true,
|
3456
3894
|
};
|
3457
3895
|
|
3458
|
-
params.no_alloc = true;
|
3459
|
-
|
3460
3896
|
struct ggml_context * ctx0 = ggml_init(params);
|
3461
3897
|
|
3462
3898
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
@@ -3528,7 +3964,7 @@ static struct ggml_cgraph * llm_build_refact(
|
|
3528
3964
|
for (int h = 0; h < 1; ++h) {
|
3529
3965
|
for (int j = 0; j < n_tokens; ++j) {
|
3530
3966
|
const llama_pos pos = batch.pos[j];
|
3531
|
-
const llama_seq_id seq_id = batch.seq_id[j];
|
3967
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
3532
3968
|
|
3533
3969
|
for (int i = 0; i < n_kv; ++i) {
|
3534
3970
|
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
@@ -3806,11 +4242,9 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3806
4242
|
struct ggml_init_params params = {
|
3807
4243
|
/*.mem_size =*/ buf_compute.size,
|
3808
4244
|
/*.mem_buffer =*/ buf_compute.data,
|
3809
|
-
/*.no_alloc =*/
|
4245
|
+
/*.no_alloc =*/ true,
|
3810
4246
|
};
|
3811
4247
|
|
3812
|
-
params.no_alloc = true;
|
3813
|
-
|
3814
4248
|
struct ggml_context * ctx0 = ggml_init(params);
|
3815
4249
|
|
3816
4250
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
@@ -3882,7 +4316,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3882
4316
|
for (int h = 0; h < 1; ++h) {
|
3883
4317
|
for (int j = 0; j < n_tokens; ++j) {
|
3884
4318
|
const llama_pos pos = batch.pos[j];
|
3885
|
-
const llama_seq_id seq_id = batch.seq_id[j];
|
4319
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
3886
4320
|
|
3887
4321
|
for (int i = 0; i < n_kv; ++i) {
|
3888
4322
|
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
@@ -4166,11 +4600,9 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
4166
4600
|
struct ggml_init_params params = {
|
4167
4601
|
/*.mem_size =*/ buf_compute.size,
|
4168
4602
|
/*.mem_buffer =*/ buf_compute.data,
|
4169
|
-
/*.no_alloc =*/
|
4603
|
+
/*.no_alloc =*/ true,
|
4170
4604
|
};
|
4171
4605
|
|
4172
|
-
params.no_alloc = true;
|
4173
|
-
|
4174
4606
|
struct ggml_context * ctx0 = ggml_init(params);
|
4175
4607
|
|
4176
4608
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
@@ -4199,23 +4631,919 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
4199
4631
|
|
4200
4632
|
ggml_allocr_alloc(lctx.alloc, token);
|
4201
4633
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4202
|
-
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
4634
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
4635
|
+
}
|
4636
|
+
}
|
4637
|
+
|
4638
|
+
{
|
4639
|
+
// Compute position embeddings.
|
4640
|
+
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4641
|
+
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
4642
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4643
|
+
for (int i = 0; i < n_tokens; ++i) {
|
4644
|
+
((int32_t *) inp_positions->data)[i] = batch.pos[i];
|
4645
|
+
}
|
4646
|
+
}
|
4647
|
+
ggml_set_name(inp_positions, "inp_positions");
|
4648
|
+
|
4649
|
+
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
4650
|
+
}
|
4651
|
+
|
4652
|
+
// KQ_scale
|
4653
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4654
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
4655
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
4656
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4657
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
4658
|
+
}
|
4659
|
+
|
4660
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4661
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4662
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
4663
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
4664
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4665
|
+
float * data = (float *) KQ_mask->data;
|
4666
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
4667
|
+
|
4668
|
+
for (int h = 0; h < 1; ++h) {
|
4669
|
+
for (int j = 0; j < n_tokens; ++j) {
|
4670
|
+
const llama_pos pos = batch.pos[j];
|
4671
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
4672
|
+
|
4673
|
+
for (int i = 0; i < n_kv; ++i) {
|
4674
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
4675
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
4676
|
+
}
|
4677
|
+
}
|
4678
|
+
}
|
4679
|
+
}
|
4680
|
+
}
|
4681
|
+
|
4682
|
+
inpL = ggml_add(ctx0, token, position);
|
4683
|
+
ggml_set_name(inpL, "inpL");
|
4684
|
+
|
4685
|
+
for (int il = 0; il < n_layer; ++il) {
|
4686
|
+
{
|
4687
|
+
// Norm
|
4688
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4689
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
4690
|
+
}
|
4691
|
+
|
4692
|
+
{
|
4693
|
+
// Self Attention
|
4694
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
4695
|
+
|
4696
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
4697
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
4698
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
4699
|
+
|
4700
|
+
struct ggml_tensor * Qcur = tmpq;
|
4701
|
+
struct ggml_tensor * Kcur = tmpk;
|
4702
|
+
|
4703
|
+
{
|
4704
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
4705
|
+
ggml_set_name(Vcur, "Vcur");
|
4706
|
+
|
4707
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
4708
|
+
ggml_set_name(k, "k");
|
4709
|
+
|
4710
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
4711
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
4712
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
4713
|
+
|
4714
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
4715
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
4716
|
+
}
|
4717
|
+
|
4718
|
+
struct ggml_tensor * Q =
|
4719
|
+
ggml_permute(ctx0,
|
4720
|
+
ggml_cpy(ctx0,
|
4721
|
+
Qcur,
|
4722
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
4723
|
+
0, 2, 1, 3);
|
4724
|
+
ggml_set_name(Q, "Q");
|
4725
|
+
|
4726
|
+
struct ggml_tensor * K =
|
4727
|
+
ggml_view_3d(ctx0, kv_self.k,
|
4728
|
+
n_embd_head, n_kv, n_head_kv,
|
4729
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4730
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
4731
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
4732
|
+
ggml_set_name(K, "K");
|
4733
|
+
|
4734
|
+
// K * Q
|
4735
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
4736
|
+
ggml_set_name(KQ, "KQ");
|
4737
|
+
|
4738
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
4739
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
4740
|
+
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
4741
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
4742
|
+
|
4743
|
+
// KQ_masked = mask_past(KQ_scaled)
|
4744
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
4745
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
4746
|
+
|
4747
|
+
// KQ = soft_max(KQ_masked)
|
4748
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
4749
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
4750
|
+
|
4751
|
+
// split cached V into n_head heads
|
4752
|
+
struct ggml_tensor * V =
|
4753
|
+
ggml_view_3d(ctx0, kv_self.v,
|
4754
|
+
n_kv, n_embd_head, n_head_kv,
|
4755
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
4756
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
4757
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
4758
|
+
ggml_set_name(V, "V");
|
4759
|
+
|
4760
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
4761
|
+
ggml_set_name(KQV, "KQV");
|
4762
|
+
|
4763
|
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
4764
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
4765
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
4766
|
+
|
4767
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
4768
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
4769
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
4770
|
+
}
|
4771
|
+
|
4772
|
+
// Projection
|
4773
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
4774
|
+
|
4775
|
+
// Add the input
|
4776
|
+
cur = ggml_add(ctx0, cur, inpL);
|
4777
|
+
|
4778
|
+
struct ggml_tensor * inpFF = cur;
|
4779
|
+
|
4780
|
+
// FF
|
4781
|
+
{
|
4782
|
+
// Norm
|
4783
|
+
{
|
4784
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
4785
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
4786
|
+
}
|
4787
|
+
|
4788
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
4789
|
+
|
4790
|
+
// GELU activation
|
4791
|
+
cur = ggml_gelu(ctx0, cur);
|
4792
|
+
|
4793
|
+
// Projection
|
4794
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
4795
|
+
}
|
4796
|
+
|
4797
|
+
inpL = ggml_add(ctx0, cur, inpFF);
|
4798
|
+
}
|
4799
|
+
|
4800
|
+
// Output Norm
|
4801
|
+
{
|
4802
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4803
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
4804
|
+
}
|
4805
|
+
ggml_set_name(cur, "result_norm");
|
4806
|
+
|
4807
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
4808
|
+
ggml_set_name(cur, "result_output");
|
4809
|
+
|
4810
|
+
ggml_build_forward_expand(gf, cur);
|
4811
|
+
ggml_free(ctx0);
|
4812
|
+
|
4813
|
+
return gf;
|
4814
|
+
}
|
4815
|
+
|
4816
|
+
static struct ggml_cgraph * llm_build_persimmon(
|
4817
|
+
llama_context & lctx,
|
4818
|
+
const llama_batch & batch) {
|
4819
|
+
const auto & model = lctx.model;
|
4820
|
+
const auto & hparams = model.hparams;
|
4821
|
+
|
4822
|
+
const auto & kv_self = lctx.kv_self;
|
4823
|
+
|
4824
|
+
GGML_ASSERT(!!kv_self.ctx);
|
4825
|
+
|
4826
|
+
const auto & cparams = lctx.cparams;
|
4827
|
+
const int64_t n_embd = hparams.n_embd;
|
4828
|
+
const int64_t n_layer = hparams.n_layer;
|
4829
|
+
const int64_t n_ctx = cparams.n_ctx;
|
4830
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
4831
|
+
const int64_t n_head = hparams.n_head;
|
4832
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
4833
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
4834
|
+
const size_t n_rot = n_embd_head / 2;
|
4835
|
+
|
4836
|
+
const float freq_base = cparams.rope_freq_base;
|
4837
|
+
const float freq_scale = cparams.rope_freq_scale;
|
4838
|
+
const float norm_eps = hparams.f_norm_eps;
|
4839
|
+
|
4840
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
4841
|
+
|
4842
|
+
|
4843
|
+
const int32_t n_tokens = batch.n_tokens;
|
4844
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
4845
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
4846
|
+
|
4847
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
4848
|
+
|
4849
|
+
auto & buf_compute = lctx.buf_compute;
|
4850
|
+
struct ggml_init_params params = {
|
4851
|
+
/*.mem_size =*/ buf_compute.size,
|
4852
|
+
/*.mem_buffer =*/ buf_compute.data,
|
4853
|
+
/*.no_alloc =*/ true,
|
4854
|
+
};
|
4855
|
+
|
4856
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
4857
|
+
|
4858
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
4859
|
+
|
4860
|
+
struct ggml_tensor * cur;
|
4861
|
+
struct ggml_tensor * inpL;
|
4862
|
+
|
4863
|
+
if (batch.token) {
|
4864
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4865
|
+
|
4866
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
4867
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4868
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
4869
|
+
}
|
4870
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
4871
|
+
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
4872
|
+
} else {
|
4873
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
4874
|
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
4875
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4876
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
4877
|
+
}
|
4878
|
+
}
|
4879
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
4880
|
+
(void) i_gpu_start;
|
4881
|
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
4882
|
+
offload_func_t offload_func_kq = llama_nop;
|
4883
|
+
offload_func_t offload_func_v = llama_nop;
|
4884
|
+
// KQ_scale
|
4885
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4886
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
4887
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4888
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
4889
|
+
}
|
4890
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
4891
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4892
|
+
offload_func_kq(KQ_mask);
|
4893
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
4894
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
4895
|
+
|
4896
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4897
|
+
float * data = (float *) KQ_mask->data;
|
4898
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
4899
|
+
for (int h = 0; h < 1; ++h) {
|
4900
|
+
for (int j = 0; j < n_tokens; ++j) {
|
4901
|
+
const llama_pos pos = batch.pos[j];
|
4902
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
4903
|
+
for (int i = 0; i < n_kv; ++i) {
|
4904
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
4905
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
4906
|
+
}
|
4907
|
+
}
|
4908
|
+
}
|
4909
|
+
}
|
4910
|
+
}
|
4911
|
+
|
4912
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4913
|
+
offload_func_kq(KQ_pos);
|
4914
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
4915
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
4916
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4917
|
+
int * data = (int *) KQ_pos->data;
|
4918
|
+
for (int i = 0; i < n_tokens; ++i) {
|
4919
|
+
data[i] = batch.pos[i];
|
4920
|
+
}
|
4921
|
+
}
|
4922
|
+
if (do_rope_shift) {
|
4923
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
4924
|
+
offload_func_kq(K_shift);
|
4925
|
+
ggml_set_name(K_shift, "K_shift");
|
4926
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
4927
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4928
|
+
int * data = (int *) K_shift->data;
|
4929
|
+
for (int i = 0; i < n_ctx; ++i) {
|
4930
|
+
data[i] = kv_self.cells[i].delta;
|
4931
|
+
}
|
4932
|
+
}
|
4933
|
+
for (int il = 0; il < n_layer; ++il) {
|
4934
|
+
struct ggml_tensor * tmp =
|
4935
|
+
// we rotate only the first n_rot dimensions.
|
4936
|
+
ggml_rope_custom_inplace(ctx0,
|
4937
|
+
ggml_view_3d(ctx0, kv_self.k,
|
4938
|
+
n_rot, n_head, n_ctx,
|
4939
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4940
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
4941
|
+
ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
|
4942
|
+
),
|
4943
|
+
K_shift, n_rot, 2, 0, freq_base, freq_scale);
|
4944
|
+
offload_func_kq(tmp);
|
4945
|
+
ggml_build_forward_expand(gf, tmp);
|
4946
|
+
}
|
4947
|
+
}
|
4948
|
+
for (int il=0; il < n_layer; ++il) {
|
4949
|
+
struct ggml_tensor * residual = inpL;
|
4950
|
+
offload_func_t offload_func = llama_nop;
|
4951
|
+
{
|
4952
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4953
|
+
offload_func(cur);
|
4954
|
+
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
|
4955
|
+
offload_func(cur);
|
4956
|
+
cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
|
4957
|
+
offload_func(cur);
|
4958
|
+
ggml_format_name(cur, "input_layernorm_%d", il);
|
4959
|
+
}
|
4960
|
+
// self attention
|
4961
|
+
{
|
4962
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
4963
|
+
offload_func_kq(cur);
|
4964
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
4965
|
+
offload_func_kq(cur);
|
4966
|
+
|
4967
|
+
// split qkv
|
4968
|
+
GGML_ASSERT(n_head_kv == n_head);
|
4969
|
+
ggml_set_name(cur, format("qkv_%d", il).c_str());
|
4970
|
+
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
|
4971
|
+
offload_func_kq(tmpqkv);
|
4972
|
+
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
|
4973
|
+
offload_func_kq(tmpqkv_perm);
|
4974
|
+
ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
|
4975
|
+
struct ggml_tensor * tmpq = ggml_view_3d(
|
4976
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
4977
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
4978
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
4979
|
+
0
|
4980
|
+
);
|
4981
|
+
offload_func_kq(tmpq);
|
4982
|
+
struct ggml_tensor * tmpk = ggml_view_3d(
|
4983
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
4984
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
4985
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
4986
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
|
4987
|
+
);
|
4988
|
+
offload_func_kq(tmpk);
|
4989
|
+
// Q/K Layernorm
|
4990
|
+
tmpq = ggml_norm(ctx0, tmpq, norm_eps);
|
4991
|
+
offload_func_kq(tmpq);
|
4992
|
+
tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
|
4993
|
+
offload_func_kq(tmpq);
|
4994
|
+
tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
|
4995
|
+
offload_func_kq(tmpq);
|
4996
|
+
|
4997
|
+
tmpk = ggml_norm(ctx0, tmpk, norm_eps);
|
4998
|
+
offload_func_v(tmpk);
|
4999
|
+
tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
|
5000
|
+
offload_func_v(tmpk);
|
5001
|
+
tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
|
5002
|
+
offload_func_v(tmpk);
|
5003
|
+
|
5004
|
+
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
5005
|
+
struct ggml_tensor * qrot = ggml_view_3d(
|
5006
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5007
|
+
ggml_element_size(tmpq) * n_embd_head,
|
5008
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5009
|
+
0
|
5010
|
+
);
|
5011
|
+
offload_func_kq(qrot);
|
5012
|
+
ggml_format_name(qrot, "qrot_%d", il);
|
5013
|
+
struct ggml_tensor * krot = ggml_view_3d(
|
5014
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5015
|
+
ggml_element_size(tmpk) * n_embd_head,
|
5016
|
+
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5017
|
+
0
|
5018
|
+
);
|
5019
|
+
offload_func_kq(krot);
|
5020
|
+
ggml_format_name(krot, "krot_%d", il);
|
5021
|
+
|
5022
|
+
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
5023
|
+
struct ggml_tensor * qpass = ggml_view_3d(
|
5024
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5025
|
+
ggml_element_size(tmpq) * n_embd_head,
|
5026
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5027
|
+
ggml_element_size(tmpq) * n_rot
|
5028
|
+
);
|
5029
|
+
offload_func_kq(qpass);
|
5030
|
+
ggml_format_name(qpass, "qpass_%d", il);
|
5031
|
+
struct ggml_tensor * kpass = ggml_view_3d(
|
5032
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5033
|
+
ggml_element_size(tmpk) * n_embd_head,
|
5034
|
+
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5035
|
+
ggml_element_size(tmpk) * n_rot
|
5036
|
+
);
|
5037
|
+
offload_func_kq(kpass);
|
5038
|
+
ggml_format_name(kpass, "kpass_%d", il);
|
5039
|
+
|
5040
|
+
struct ggml_tensor * qrotated = ggml_rope_custom(
|
5041
|
+
ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
5042
|
+
);
|
5043
|
+
offload_func_kq(qrotated);
|
5044
|
+
struct ggml_tensor * krotated = ggml_rope_custom(
|
5045
|
+
ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
5046
|
+
);
|
5047
|
+
offload_func_kq(krotated);
|
5048
|
+
// ggml currently only supports concatenation on dim=2
|
5049
|
+
// so we need to permute qrot, qpass, concat, then permute back.
|
5050
|
+
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
5051
|
+
offload_func_kq(qrotated);
|
5052
|
+
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
5053
|
+
offload_func_kq(krotated);
|
5054
|
+
|
5055
|
+
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
5056
|
+
offload_func_kq(qpass);
|
5057
|
+
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
5058
|
+
offload_func_kq(kpass);
|
5059
|
+
|
5060
|
+
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
5061
|
+
offload_func_kq(Qcur);
|
5062
|
+
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
5063
|
+
offload_func_kq(Kcur);
|
5064
|
+
|
5065
|
+
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
|
5066
|
+
offload_func_kq(Q);
|
5067
|
+
|
5068
|
+
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
5069
|
+
offload_func_kq(Kcur);
|
5070
|
+
{
|
5071
|
+
struct ggml_tensor * tmpv = ggml_view_3d(
|
5072
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
5073
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
5074
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
5075
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
|
5076
|
+
);
|
5077
|
+
offload_func_v(tmpv);
|
5078
|
+
// store K, V in cache
|
5079
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
5080
|
+
offload_func_v(Vcur);
|
5081
|
+
ggml_set_name(Vcur, "Vcur");
|
5082
|
+
|
5083
|
+
struct ggml_tensor * k = ggml_view_1d(
|
5084
|
+
ctx0, kv_self.k, n_tokens*n_embd_gqa,
|
5085
|
+
(ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
|
5086
|
+
);
|
5087
|
+
offload_func_kq(k);
|
5088
|
+
ggml_set_name(k, "k");
|
5089
|
+
|
5090
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5091
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
5092
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5093
|
+
offload_func_v(v);
|
5094
|
+
ggml_set_name(v, "v");
|
5095
|
+
|
5096
|
+
// important: storing RoPE-ed version of K in the KV cache!
|
5097
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5098
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5099
|
+
}
|
5100
|
+
struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
|
5101
|
+
n_embd_head, n_kv, n_head_kv,
|
5102
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5103
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5104
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5105
|
+
|
5106
|
+
offload_func_kq(K);
|
5107
|
+
ggml_format_name(K, "K_%d", il);
|
5108
|
+
|
5109
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5110
|
+
offload_func_kq(KQ);
|
5111
|
+
ggml_set_name(KQ, "KQ");
|
5112
|
+
|
5113
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
5114
|
+
offload_func_kq(KQ_scaled);
|
5115
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5116
|
+
|
5117
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
5118
|
+
offload_func_kq(KQ_masked);
|
5119
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
5120
|
+
|
5121
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
5122
|
+
offload_func_kq(KQ_soft_max);
|
5123
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5124
|
+
|
5125
|
+
struct ggml_tensor * V =
|
5126
|
+
ggml_view_3d(ctx0, kv_self.v,
|
5127
|
+
n_kv, n_embd_head, n_head_kv,
|
5128
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
5129
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5130
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5131
|
+
offload_func_v(V);
|
5132
|
+
ggml_set_name(V, "V");
|
5133
|
+
|
5134
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5135
|
+
offload_func_v(KQV);
|
5136
|
+
ggml_set_name(KQV, "KQV");
|
5137
|
+
|
5138
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5139
|
+
offload_func_v(KQV_merged);
|
5140
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5141
|
+
|
5142
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5143
|
+
offload_func_v(cur);
|
5144
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5145
|
+
|
5146
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
5147
|
+
offload_func(cur);
|
5148
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
5149
|
+
offload_func(cur);
|
5150
|
+
ggml_set_name(cur, "result_wo");
|
5151
|
+
}
|
5152
|
+
|
5153
|
+
struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
|
5154
|
+
offload_func(inpFF);
|
5155
|
+
ggml_set_name(inpFF, "inpFF");
|
5156
|
+
{
|
5157
|
+
// MLP
|
5158
|
+
{
|
5159
|
+
// Norm
|
5160
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
5161
|
+
offload_func(cur);
|
5162
|
+
cur = ggml_add(ctx0,
|
5163
|
+
ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
|
5164
|
+
model.layers[il].ffn_norm_b
|
5165
|
+
);
|
5166
|
+
ggml_set_name(cur, "ffn_norm");
|
5167
|
+
offload_func(cur);
|
5168
|
+
}
|
5169
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
|
5170
|
+
offload_func(cur);
|
5171
|
+
|
5172
|
+
cur = ggml_add(ctx0, cur, model.layers[il].b3);
|
5173
|
+
offload_func(cur);
|
5174
|
+
ggml_set_name(cur, "result_ffn_up");
|
5175
|
+
|
5176
|
+
cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
|
5177
|
+
ggml_set_name(cur, "result_ffn_act");
|
5178
|
+
offload_func(cur);
|
5179
|
+
offload_func(cur->src[0]);
|
5180
|
+
|
5181
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
5182
|
+
offload_func(cur);
|
5183
|
+
cur = ggml_add(ctx0,
|
5184
|
+
cur,
|
5185
|
+
model.layers[il].b2);
|
5186
|
+
offload_func(cur);
|
5187
|
+
ggml_set_name(cur, "outFF");
|
5188
|
+
}
|
5189
|
+
cur = ggml_add(ctx0, cur, inpFF);
|
5190
|
+
offload_func(cur);
|
5191
|
+
ggml_set_name(cur, "inpFF_+_outFF");
|
5192
|
+
inpL = cur;
|
5193
|
+
}
|
5194
|
+
cur = inpL;
|
5195
|
+
{
|
5196
|
+
cur = ggml_norm(ctx0, cur, norm_eps);
|
5197
|
+
offload_func_nr(cur);
|
5198
|
+
cur = ggml_mul(ctx0, cur, model.output_norm);
|
5199
|
+
offload_func_nr(cur);
|
5200
|
+
|
5201
|
+
cur = ggml_add(ctx0, cur, model.output_norm_b);
|
5202
|
+
// offload_func_nr(cur);
|
5203
|
+
|
5204
|
+
ggml_set_name(cur, "result_norm");
|
5205
|
+
}
|
5206
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5207
|
+
ggml_set_name(cur, "result_output");
|
5208
|
+
ggml_build_forward_expand(gf, cur);
|
5209
|
+
ggml_free(ctx0);
|
5210
|
+
return gf;
|
5211
|
+
}
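
llm_build_persimmon above rotates only the first n_rot = n_embd_head/2 dimensions of each query/key head (the "qrot"/"krot" views) and passes the remaining half through unchanged ("qpass"/"kpass"). Below is a conceptual sketch of that partial rotation on a single head vector; the adjacent-pair rotation used here is one common RoPE layout, while the exact pairing in ggml depends on the rope mode flag, so treat this as an illustration rather than a bit-exact re-implementation, with invented sizes:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Rotate the first n_rot dimensions of x by position-dependent angles,
    // leave x[n_rot..] untouched.
    static void partial_rope(std::vector<float> & x, int n_rot, int pos,
                             float freq_base = 10000.0f) {
        for (int i = 0; i + 1 < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -((float) i) / n_rot);
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            const float x0 = x[i];
            const float x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main() {
        std::vector<float> head = {1, 0, 1, 0, 1, 0, 1, 0}; // n_embd_head = 8
        partial_rope(head, /*n_rot =*/ 4, /*pos =*/ 3);      // rotate first half only
        for (float v : head) printf("%.3f ", v);
        printf("\n");
        return 0;
    }
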
|
5212
|
+
|
5213
|
+
static struct ggml_cgraph * llm_build_bloom(
|
5214
|
+
llama_context & lctx,
|
5215
|
+
const llama_batch & batch) {
|
5216
|
+
const auto & model = lctx.model;
|
5217
|
+
const auto & hparams = model.hparams;
|
5218
|
+
const auto & cparams = lctx.cparams;
|
5219
|
+
|
5220
|
+
const auto & kv_self = lctx.kv_self;
|
5221
|
+
|
5222
|
+
GGML_ASSERT(!!kv_self.ctx);
|
5223
|
+
|
5224
|
+
const int64_t n_embd = hparams.n_embd;
|
5225
|
+
const int64_t n_layer = hparams.n_layer;
|
5226
|
+
const int64_t n_ctx = cparams.n_ctx;
|
5227
|
+
const int64_t n_head = hparams.n_head;
|
5228
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
5229
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
5230
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5231
|
+
|
5232
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
5233
|
+
|
5234
|
+
const float norm_eps = hparams.f_norm_eps;
|
5235
|
+
|
5236
|
+
const int32_t n_tokens = batch.n_tokens;
|
5237
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5238
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5239
|
+
|
5240
|
+
auto & buf_compute = lctx.buf_compute;
|
5241
|
+
|
5242
|
+
struct ggml_init_params params = {
|
5243
|
+
/*.mem_size =*/ buf_compute.size,
|
5244
|
+
/*.mem_buffer =*/ buf_compute.data,
|
5245
|
+
/*.no_alloc =*/ false,
|
5246
|
+
};
|
5247
|
+
|
5248
|
+
params.no_alloc = true;
|
5249
|
+
|
5250
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
5251
|
+
|
5252
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
5253
|
+
|
5254
|
+
struct ggml_tensor * cur;
|
5255
|
+
struct ggml_tensor * token;
|
5256
|
+
struct ggml_tensor * inpL;
|
5257
|
+
|
5258
|
+
if (batch.token) {
|
5259
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5260
|
+
|
5261
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
5262
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5263
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
5264
|
+
}
|
5265
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
5266
|
+
|
5267
|
+
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
5268
|
+
} else {
|
5269
|
+
#ifdef GGML_USE_MPI
|
5270
|
+
GGML_ASSERT(false && "not implemented");
|
5271
|
+
#endif
|
5272
|
+
|
5273
|
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
5274
|
+
|
5275
|
+
ggml_allocr_alloc(lctx.alloc, token);
|
5276
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5277
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
5278
|
+
}
|
5279
|
+
}
|
5280
|
+
|
5281
|
+
// KQ_scale
|
5282
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5283
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
5284
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
5285
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5286
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
5287
|
+
}
|
5288
|
+
|
5289
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5290
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5291
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
5292
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
5293
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5294
|
+
float * data = (float *) KQ_mask->data;
|
5295
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
5296
|
+
|
5297
|
+
for (int h = 0; h < 1; ++h) {
|
5298
|
+
for (int j = 0; j < n_tokens; ++j) {
|
5299
|
+
const llama_pos pos = batch.pos[j];
|
5300
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
5301
|
+
|
5302
|
+
for (int i = 0; i < n_kv; ++i) {
|
5303
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
5304
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
5305
|
+
}
|
5306
|
+
}
|
5307
|
+
}
|
5308
|
+
}
|
5309
|
+
}
|
5310
|
+
|
5311
|
+
// norm
|
5312
|
+
{
|
5313
|
+
inpL = ggml_norm(ctx0, token, norm_eps);
|
5314
|
+
inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
|
5315
|
+
}
|
5316
|
+
|
5317
|
+
ggml_set_name(inpL, "inpL");
|
5318
|
+
|
5319
|
+
for (int il = 0; il < n_layer; ++il) {
|
5320
|
+
{
|
5321
|
+
// Norm
|
5322
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5323
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
5324
|
+
}
|
5325
|
+
|
5326
|
+
{
|
5327
|
+
// Self Attention
|
5328
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
5329
|
+
|
5330
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
5331
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
5332
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
5333
|
+
|
5334
|
+
struct ggml_tensor * Qcur = tmpq;
|
5335
|
+
struct ggml_tensor * Kcur = tmpk;
|
5336
|
+
|
5337
|
+
// store key and value to memory
|
5338
|
+
{
|
5339
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
5340
|
+
ggml_set_name(Vcur, "Vcur");
|
5341
|
+
|
5342
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
5343
|
+
ggml_set_name(k, "k");
|
5344
|
+
|
5345
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5346
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
5347
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5348
|
+
|
5349
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5350
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5351
|
+
}
|
5352
|
+
|
5353
|
+
struct ggml_tensor * Q =
|
5354
|
+
ggml_permute(ctx0,
|
5355
|
+
ggml_cpy(ctx0,
|
5356
|
+
Qcur,
|
5357
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
5358
|
+
0, 2, 1, 3);
|
5359
|
+
ggml_set_name(Q, "Q");
|
5360
|
+
|
5361
|
+
struct ggml_tensor * K =
|
5362
|
+
ggml_view_3d(ctx0, kv_self.k,
|
5363
|
+
n_embd_head, n_kv, n_head_kv,
|
5364
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5365
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5366
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5367
|
+
ggml_set_name(K, "K");
|
5368
|
+
|
5369
|
+
// K * Q
|
5370
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5371
|
+
ggml_set_name(KQ, "KQ");
|
5372
|
+
|
5373
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
5374
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
5375
|
+
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
5376
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5377
|
+
|
5378
|
+
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
|
5379
|
+
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
5380
|
+
|
5381
|
+
// KQ_masked = mask_past(KQ_scaled)
|
5382
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
5383
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
5384
|
+
|
5385
|
+
// KQ = soft_max(KQ_masked)
|
5386
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
5387
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5388
|
+
|
5389
|
+
// split cached V into n_head heads
|
5390
|
+
struct ggml_tensor * V =
|
5391
|
+
ggml_view_3d(ctx0, kv_self.v,
|
5392
|
+
n_kv, n_embd_head, n_head_kv,
|
5393
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
5394
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5395
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5396
|
+
ggml_set_name(V, "V");
|
5397
|
+
|
5398
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5399
|
+
ggml_set_name(KQV, "KQV");
|
5400
|
+
|
5401
|
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
5402
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5403
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5404
|
+
|
5405
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
5406
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5407
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5408
|
+
}
|
5409
|
+
|
5410
|
+
// Projection
|
5411
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
5412
|
+
|
5413
|
+
// Add the input
|
5414
|
+
cur = ggml_add(ctx0, cur, inpL);
|
5415
|
+
|
5416
|
+
struct ggml_tensor * inpFF = cur;
|
5417
|
+
|
5418
|
+
// FF
|
5419
|
+
{
|
5420
|
+
// Norm
|
5421
|
+
{
|
5422
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
5423
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
5424
|
+
}
|
5425
|
+
|
5426
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
5427
|
+
|
5428
|
+
// GELU activation
|
5429
|
+
cur = ggml_gelu(ctx0, cur);
|
5430
|
+
|
5431
|
+
// Projection
|
5432
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
5433
|
+
}
|
5434
|
+
|
5435
|
+
inpL = ggml_add(ctx0, cur, inpFF);
|
5436
|
+
}
|
5437
|
+
|
5438
|
+
// Output Norm
|
5439
|
+
{
|
5440
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5441
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
5442
|
+
}
|
5443
|
+
ggml_set_name(cur, "result_norm");
|
5444
|
+
|
5445
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5446
|
+
ggml_set_name(cur, "result_output");
|
5447
|
+
|
5448
|
+
ggml_build_forward_expand(gf, cur);
|
5449
|
+
|
5450
|
+
ggml_free(ctx0);
|
5451
|
+
|
5452
|
+
return gf;
|
5453
|
+
}
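
The BLOOM builder above uses ggml_alibi instead of rotary embeddings: each attention head receives a linear distance penalty whose slope comes from a geometric sequence controlled by the last argument (8, the maximum bias). Below is a sketch of the standard ALiBi slope/bias computation for a power-of-two head count; the in-kernel formulation in ggml differs by a per-row constant (which softmax cancels) and by extra handling for non-power-of-two head counts, so take this as the concept rather than the exact kernel:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 8;
        const float max_bias = 8.0f;
        const int   q_pos    = 5;   // query position (illustrative)

        for (int h = 0; h < n_head; ++h) {
            // slope for head h: 2^(-max_bias * (h + 1) / n_head)
            const float slope = std::pow(2.0f, -max_bias * (h + 1) / n_head);

            printf("head %d slope %.4f biases:", h, slope);
            for (int i = 0; i <= q_pos; ++i) {
                // more distant keys receive a larger (more negative) bias
                printf(" %+.3f", slope * (i - q_pos));
            }
            printf("\n");
        }
        return 0;
    }
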
|
5454
|
+
|
5455
|
+
static struct ggml_cgraph * llm_build_mpt(
|
5456
|
+
llama_context & lctx,
|
5457
|
+
const llama_batch & batch) {
|
5458
|
+
const auto & model = lctx.model;
|
5459
|
+
const auto & hparams = model.hparams;
|
5460
|
+
const auto & cparams = lctx.cparams;
|
5461
|
+
|
5462
|
+
const auto & kv_self = lctx.kv_self;
|
5463
|
+
|
5464
|
+
GGML_ASSERT(!!kv_self.ctx);
|
5465
|
+
|
5466
|
+
const int64_t n_embd = hparams.n_embd;
|
5467
|
+
const int64_t n_layer = hparams.n_layer;
|
5468
|
+
const int64_t n_ctx = cparams.n_ctx;
|
5469
|
+
const int64_t n_head = hparams.n_head;
|
5470
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
5471
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
5472
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5473
|
+
|
5474
|
+
const float norm_eps = hparams.f_norm_eps;
|
5475
|
+
const float clamp_kqv = hparams.f_clamp_kqv;
|
5476
|
+
const float max_alibi_bias = hparams.f_max_alibi_bias;
|
5477
|
+
|
5478
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
5479
|
+
|
5480
|
+
const int32_t n_tokens = batch.n_tokens;
|
5481
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5482
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5483
|
+
|
5484
|
+
auto & buf_compute = lctx.buf_compute;
|
5485
|
+
|
5486
|
+
struct ggml_init_params params = {
|
5487
|
+
/*.mem_size =*/ buf_compute.size,
|
5488
|
+
/*.mem_buffer =*/ buf_compute.data,
|
5489
|
+
/*.no_alloc =*/ false,
|
5490
|
+
};
|
5491
|
+
|
5492
|
+
params.no_alloc = true;
|
5493
|
+
|
5494
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
5495
|
+
|
5496
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
5497
|
+
|
5498
|
+
struct ggml_tensor * cur;
|
5499
|
+
struct ggml_tensor * inpL;
|
5500
|
+
|
5501
|
+
//int warmup = 0;
|
5502
|
+
if (batch.token) {
|
5503
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5504
|
+
|
5505
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
5506
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5507
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
5508
|
+
//warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
|
5509
|
+
}
|
5510
|
+
|
5511
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
5512
|
+
|
5513
|
+
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
5514
|
+
} else {
|
5515
|
+
#ifdef GGML_USE_MPI
|
5516
|
+
GGML_ASSERT(false && "not implemented");
|
5517
|
+
#endif
|
5518
|
+
|
5519
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
5520
|
+
|
5521
|
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
5522
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5523
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
4203
5524
|
}
|
4204
5525
|
}
|
4205
5526
|
|
4206
|
-
|
4207
|
-
|
4208
|
-
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4209
|
-
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
4210
|
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4211
|
-
for (int i = 0; i < n_tokens; ++i) {
|
4212
|
-
((int32_t *) inp_positions->data)[i] = batch.pos[i];
|
4213
|
-
}
|
4214
|
-
}
|
4215
|
-
ggml_set_name(inp_positions, "inp_positions");
|
5527
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
5528
|
+
(void) i_gpu_start;
|
4216
5529
|
|
4217
|
-
|
5530
|
+
// offload functions set the tensor output backend to GPU
|
5531
|
+
// tensors are GPU-accelerated if any input or the output has been offloaded
|
5532
|
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
5533
|
+
offload_func_t offload_func_kq = llama_nop;
|
5534
|
+
offload_func_t offload_func_v = llama_nop;
|
5535
|
+
|
5536
|
+
#ifdef GGML_USE_CUBLAS
|
5537
|
+
if (n_gpu_layers > n_layer) {
|
5538
|
+
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
4218
5539
|
}
|
5540
|
+
if (n_gpu_layers > n_layer + 1) {
|
5541
|
+
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
5542
|
+
}
|
5543
|
+
if (n_gpu_layers > n_layer + 2) {
|
5544
|
+
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
5545
|
+
}
|
5546
|
+
#endif // GGML_USE_CUBLAS
|
4219
5547
|
|
4220
5548
|
// KQ_scale
|
4221
5549
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
@@ -4227,6 +5555,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
4227
5555
|
|
4228
5556
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4229
5557
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5558
|
+
offload_func_kq(KQ_mask);
|
4230
5559
|
ggml_set_name(KQ_mask, "KQ_mask");
|
4231
5560
|
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
4232
5561
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
@@ -4236,7 +5565,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
4236
5565
|
for (int h = 0; h < 1; ++h) {
|
4237
5566
|
for (int j = 0; j < n_tokens; ++j) {
|
4238
5567
|
const llama_pos pos = batch.pos[j];
|
4239
|
-
const llama_seq_id seq_id = batch.seq_id[j];
|
5568
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
4240
5569
|
|
4241
5570
|
for (int i = 0; i < n_kv; ++i) {
|
4242
5571
|
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
@@ -4247,48 +5576,87 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
4247
5576
|
}
|
4248
5577
|
}
|
4249
5578
|
|
4250
|
-
inpL = ggml_add(ctx0, token, position);
|
4251
|
-
ggml_set_name(inpL, "inpL");
|
4252
|
-
|
4253
5579
|
for (int il = 0; il < n_layer; ++il) {
|
4254
|
-
|
4255
|
-
|
4256
|
-
|
4257
|
-
|
5580
|
+
struct ggml_tensor * attn_norm;
|
5581
|
+
|
5582
|
+
offload_func_t offload_func = llama_nop;
|
5583
|
+
|
5584
|
+
#ifdef GGML_USE_CUBLAS
|
5585
|
+
if (il >= i_gpu_start) {
|
5586
|
+
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
4258
5587
|
}
|
5588
|
+
#endif // GGML_USE_CUBLAS
|
4259
5589
|
|
5590
|
+
// self-attention
|
5591
|
+
// TODO: refactor into common function (shared with LLaMA)
|
4260
5592
|
{
|
4261
|
-
|
4262
|
-
|
5593
|
+
attn_norm = ggml_norm(ctx0, inpL, norm_eps);
|
5594
|
+
offload_func(attn_norm);
|
4263
5595
|
|
4264
|
-
|
4265
|
-
|
4266
|
-
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
5596
|
+
attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
|
5597
|
+
offload_func(attn_norm);
|
4267
5598
|
|
4268
|
-
|
4269
|
-
|
5599
|
+
if (1) {
|
5600
|
+
cur = attn_norm;
|
5601
|
+
}
|
5602
|
+
|
5603
|
+
// compute QKV
|
5604
|
+
|
5605
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5606
|
+
offload_func_kq(cur);
|
5607
|
+
|
5608
|
+
if (clamp_kqv > 0.0f) {
|
5609
|
+
cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
|
5610
|
+
offload_func_kq(cur);
|
5611
|
+
}
|
5612
|
+
|
5613
|
+
const size_t wsize = ggml_type_size(cur->type);
|
5614
|
+
|
5615
|
+
struct ggml_tensor * Qcur = ggml_view_3d(
|
5616
|
+
ctx0, cur, n_embd_head, n_head, n_tokens,
|
5617
|
+
wsize * n_embd_head,
|
5618
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5619
|
+
0);
|
5620
|
+
offload_func_kq(Qcur);
|
5621
|
+
|
5622
|
+
struct ggml_tensor * Kcur = ggml_view_3d(
|
5623
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
5624
|
+
wsize * n_embd_head,
|
5625
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5626
|
+
wsize * n_embd_head * n_head);
|
5627
|
+
offload_func_kq(Kcur);
|
5628
|
+
|
5629
|
+
struct ggml_tensor * tmpv = ggml_view_3d(
|
5630
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
5631
|
+
wsize * n_embd_head,
|
5632
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5633
|
+
wsize * n_embd_head * (n_head + n_head_kv));
|
5634
|
+
offload_func_kq(Kcur);
|
5635
|
+
|
5636
|
+
ggml_set_name(Qcur, "Qcur");
|
5637
|
+
ggml_set_name(Kcur, "Kcur");
|
4270
5638
|
|
4271
5639
|
{
|
4272
5640
|
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
5641
|
+
offload_func_v(Vcur);
|
5642
|
+
offload_func_v(Vcur->src[0]->src[0]);
|
4273
5643
|
ggml_set_name(Vcur, "Vcur");
|
4274
5644
|
|
4275
5645
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
5646
|
+
offload_func_kq(k);
|
4276
5647
|
ggml_set_name(k, "k");
|
4277
5648
|
|
4278
5649
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
4279
5650
|
( n_ctx)*ggml_element_size(kv_self.v),
|
4280
5651
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5652
|
+
offload_func_v(v);
|
4281
5653
|
|
4282
5654
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
4283
5655
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
4284
5656
|
}
|
4285
5657
|
|
4286
|
-
struct ggml_tensor * Q =
|
4287
|
-
|
4288
|
-
ggml_cpy(ctx0,
|
4289
|
-
Qcur,
|
4290
|
-
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
4291
|
-
0, 2, 1, 3);
|
5658
|
+
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
5659
|
+
offload_func_kq(Q);
|
4292
5660
|
ggml_set_name(Q, "Q");
|
4293
5661
|
|
4294
5662
|
struct ggml_tensor * K =
|
@@ -4297,85 +5665,105 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
4297
5665
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4298
5666
|
ggml_element_size(kv_self.k)*n_embd_head,
|
4299
5667
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5668
|
+
offload_func_kq(K);
|
4300
5669
|
ggml_set_name(K, "K");
|
4301
5670
|
|
4302
|
-
// K * Q
|
4303
5671
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5672
|
+
offload_func_kq(KQ);
|
4304
5673
|
ggml_set_name(KQ, "KQ");
|
4305
5674
|
|
4306
|
-
|
4307
|
-
|
4308
|
-
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
5675
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
5676
|
+
offload_func_kq(KQ_scaled);
|
4309
5677
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
4310
5678
|
|
4311
|
-
//
|
4312
|
-
struct ggml_tensor *
|
5679
|
+
// TODO: replace with ggml_add()
|
5680
|
+
struct ggml_tensor * KQ_scaled_alibi =
|
5681
|
+
ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
|
5682
|
+
offload_func_kq(KQ_scaled_alibi);
|
5683
|
+
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
5684
|
+
|
5685
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
5686
|
+
offload_func_kq(KQ_masked);
|
4313
5687
|
ggml_set_name(KQ_masked, "KQ_masked");
|
4314
5688
|
|
4315
|
-
|
4316
|
-
|
5689
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
5690
|
+
offload_func_v(KQ_soft_max);
|
4317
5691
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
4318
5692
|
|
4319
|
-
// split cached V into n_head heads
|
4320
5693
|
struct ggml_tensor * V =
|
4321
5694
|
ggml_view_3d(ctx0, kv_self.v,
|
4322
5695
|
n_kv, n_embd_head, n_head_kv,
|
4323
5696
|
ggml_element_size(kv_self.v)*n_ctx,
|
4324
5697
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
4325
5698
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5699
|
+
offload_func_v(V);
|
4326
5700
|
ggml_set_name(V, "V");
|
4327
5701
|
|
4328
5702
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5703
|
+
offload_func_v(KQV);
|
4329
5704
|
ggml_set_name(KQV, "KQV");
|
4330
5705
|
|
4331
|
-
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
4332
5706
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5707
|
+
offload_func_v(KQV_merged);
|
4333
5708
|
ggml_set_name(KQV_merged, "KQV_merged");
|
4334
5709
|
|
4335
|
-
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
4336
5710
|
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5711
|
+
offload_func_v(cur);
|
4337
5712
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
4338
|
-
}
|
4339
5713
|
|
4340
|
-
|
4341
|
-
|
5714
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
5715
|
+
offload_func(cur);
|
5716
|
+
ggml_set_name(cur, "result_wo");
|
5717
|
+
}
|
4342
5718
|
|
4343
5719
|
// Add the input
|
4344
5720
|
cur = ggml_add(ctx0, cur, inpL);
|
5721
|
+
offload_func(cur);
|
4345
5722
|
|
4346
|
-
struct ggml_tensor *
|
5723
|
+
struct ggml_tensor * attn_out = cur;
|
4347
5724
|
|
4348
|
-
//
|
5725
|
+
// feed forward
|
4349
5726
|
{
|
4350
5727
|
// Norm
|
4351
5728
|
{
|
4352
|
-
cur = ggml_norm(ctx0,
|
4353
|
-
|
5729
|
+
cur = ggml_norm(ctx0, attn_out, norm_eps);
|
5730
|
+
offload_func(cur);
|
5731
|
+
|
5732
|
+
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
5733
|
+
offload_func(cur);
|
4354
5734
|
}
|
4355
5735
|
|
4356
|
-
cur =
|
5736
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
|
5737
|
+
offload_func(cur);
|
4357
5738
|
|
4358
|
-
// GELU activation
|
4359
5739
|
cur = ggml_gelu(ctx0, cur);
|
4360
|
-
|
4361
|
-
|
4362
|
-
|
5740
|
+
offload_func(cur);
|
5741
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
5742
|
+
offload_func(cur);
|
4363
5743
|
}
|
4364
5744
|
|
4365
|
-
|
5745
|
+
cur = ggml_add(ctx0, cur, attn_out);
|
5746
|
+
offload_func(cur);
|
5747
|
+
// input for next layer
|
5748
|
+
inpL = cur;
|
4366
5749
|
}
|
4367
5750
|
|
4368
|
-
|
5751
|
+
cur = inpL;
|
5752
|
+
|
5753
|
+
// norm
|
4369
5754
|
{
|
4370
|
-
cur = ggml_norm(ctx0,
|
4371
|
-
|
5755
|
+
cur = ggml_norm(ctx0, cur, norm_eps);
|
5756
|
+
offload_func_nr(cur);
|
5757
|
+
|
5758
|
+
cur = ggml_mul(ctx0, cur, model.output_norm);
|
5759
|
+
ggml_set_name(cur, "result_norm");
|
4372
5760
|
}
|
4373
|
-
ggml_set_name(cur, "result_norm");
|
4374
5761
|
|
4375
5762
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
4376
5763
|
ggml_set_name(cur, "result_output");
|
4377
5764
|
|
4378
5765
|
ggml_build_forward_expand(gf, cur);
|
5766
|
+
|
4379
5767
|
ggml_free(ctx0);
|
4380
5768
|
|
4381
5769
|
return gf;
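
The MPT block above slices a single fused wqkv output row into Q, K and V with strided views: Q occupies the first n_head head-vectors, K the next n_head_kv, and V the last n_head_kv, so the row width is n_embd_head * (n_head + 2*n_head_kv). Below is a small sketch of the same offset arithmetic on a flat buffer (sizes invented, plain pointers instead of ggml views):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd_head = 4;
        const int n_head      = 8;  // query heads
        const int n_head_kv   = 2;  // shared key/value heads

        const int row = n_embd_head * (n_head + 2 * n_head_kv); // fused QKV width
        std::vector<float> qkv(row);
        for (int i = 0; i < row; ++i) qkv[i] = (float) i;

        const float * Q = qkv.data();                                      // offset 0
        const float * K = qkv.data() + n_embd_head * n_head;               // after Q
        const float * V = qkv.data() + n_embd_head * (n_head + n_head_kv); // after K

        printf("row width : %d floats\n", row);
        printf("Q spans   : [0, %d)\n",  n_embd_head * n_head);
        printf("K spans   : [%d, %d)\n", n_embd_head * n_head,
                                         n_embd_head * (n_head + n_head_kv));
        printf("V spans   : [%d, %d)\n", n_embd_head * (n_head + n_head_kv), row);
        printf("Q[0]=%.1f K[0]=%.1f V[0]=%.1f\n", Q[0], K[0], V[0]);
        return 0;
    }
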
|
@@ -4405,10 +5793,22 @@ static struct ggml_cgraph * llama_build_graph(
|
|
4405
5793
|
{
|
4406
5794
|
result = llm_build_starcoder(lctx, batch);
|
4407
5795
|
} break;
|
5796
|
+
case LLM_ARCH_PERSIMMON:
|
5797
|
+
{
|
5798
|
+
result = llm_build_persimmon(lctx, batch);
|
5799
|
+
} break;
|
4408
5800
|
case LLM_ARCH_REFACT:
|
4409
5801
|
{
|
4410
5802
|
result = llm_build_refact(lctx, batch);
|
4411
5803
|
} break;
|
5804
|
+
case LLM_ARCH_BLOOM:
|
5805
|
+
{
|
5806
|
+
result = llm_build_bloom(lctx, batch);
|
5807
|
+
} break;
|
5808
|
+
case LLM_ARCH_MPT:
|
5809
|
+
{
|
5810
|
+
result = llm_build_mpt(lctx, batch);
|
5811
|
+
} break;
|
4412
5812
|
default:
|
4413
5813
|
GGML_ASSERT(false);
|
4414
5814
|
}
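
The hunk above extends the per-architecture switch in llama_build_graph with the three new builders. For comparison only, the same dispatch could be written as a lookup table of function pointers; this is not how llama.cpp is structured, just an illustration of the design space, and all names below are made up:

    #include <cstdio>
    #include <map>

    enum demo_arch { DEMO_ARCH_PERSIMMON, DEMO_ARCH_BLOOM, DEMO_ARCH_MPT };
    typedef const char * (*demo_build_fn)();   // stand-in for the real builder signature

    static const char * demo_build_persimmon() { return "persimmon graph"; }
    static const char * demo_build_bloom()     { return "bloom graph"; }
    static const char * demo_build_mpt()       { return "mpt graph"; }

    int main() {
        const std::map<demo_arch, demo_build_fn> builders = {
            { DEMO_ARCH_PERSIMMON, demo_build_persimmon },
            { DEMO_ARCH_BLOOM,     demo_build_bloom     },
            { DEMO_ARCH_MPT,       demo_build_mpt       },
        };
        printf("%s\n", builders.at(DEMO_ARCH_BLOOM)());
        return 0;
    }

A switch keeps the dispatch visible in one place and lets the compiler warn on unhandled enum values, which is likely why the table form is not used here.
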
|
@@ -4420,7 +5820,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
4420
5820
|
//
|
4421
5821
|
// - lctx: llama context
|
4422
5822
|
// - batch: batch to evaluate
|
4423
|
-
// - n_threads: number of threads to use
|
4424
5823
|
//
|
4425
5824
|
// return 0 on success
|
4426
5825
|
// return positive int on warning
|
@@ -4466,8 +5865,11 @@ static int llama_decode_internal(
|
|
4466
5865
|
|
4467
5866
|
// helpers for smoother batch API transition
|
4468
5867
|
// after deprecating the llama_eval calls, these will be removed
|
4469
|
-
std::vector<llama_pos>
|
4470
|
-
|
5868
|
+
std::vector<llama_pos> pos;
|
5869
|
+
|
5870
|
+
std::vector<int32_t> n_seq_id;
|
5871
|
+
std::vector<llama_seq_id *> seq_id_arr;
|
5872
|
+
std::vector<std::vector<llama_seq_id>> seq_id;
|
4471
5873
|
|
4472
5874
|
if (batch.pos == nullptr) {
|
4473
5875
|
pos.resize(n_tokens);
|
@@ -4479,18 +5881,20 @@ static int llama_decode_internal(
|
|
4479
5881
|
}
|
4480
5882
|
|
4481
5883
|
if (batch.seq_id == nullptr) {
|
5884
|
+
n_seq_id.resize(n_tokens);
|
4482
5885
|
seq_id.resize(n_tokens);
|
5886
|
+
seq_id_arr.resize(n_tokens);
|
4483
5887
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
4484
|
-
|
5888
|
+
n_seq_id[i] = 1;
|
5889
|
+
seq_id[i].resize(1);
|
5890
|
+
seq_id[i][0] = batch.all_seq_id;
|
5891
|
+
seq_id_arr[i] = seq_id[i].data();
|
4485
5892
|
}
|
4486
5893
|
|
4487
|
-
batch.
|
5894
|
+
batch.n_seq_id = n_seq_id.data();
|
5895
|
+
batch.seq_id = seq_id_arr.data();
|
4488
5896
|
}
|
4489
5897
|
|
4490
|
-
// we always start to search for a free slot from the start of the cache
|
4491
|
-
// TODO: better strategies can be implemented
|
4492
|
-
kv_self.head = 0;
|
4493
|
-
|
4494
5898
|
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
4495
5899
|
return 1;
|
4496
5900
|
}
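
The hunk above backfills per-token sequence ids when the caller passed batch.seq_id == nullptr: every token gets a count of one and the batch-wide all_seq_id. Below is a standalone sketch of that defaulting pattern (int32_t stands in for llama_seq_id; the surrounding llama_batch plumbing is omitted):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const uint32_t n_tokens   = 4;
        const int32_t  all_seq_id = 0;

        std::vector<int32_t>              n_seq_id(n_tokens);
        std::vector<std::vector<int32_t>> seq_id(n_tokens);
        std::vector<int32_t *>            seq_id_arr(n_tokens);

        for (uint32_t i = 0; i < n_tokens; i++) {
            n_seq_id[i]   = 1;                  // one sequence per token by default
            seq_id[i]     = { all_seq_id };     // that sequence is the batch-wide id
            seq_id_arr[i] = seq_id[i].data();   // raw pointer view handed to the batch
        }

        for (uint32_t i = 0; i < n_tokens; i++) {
            printf("token %u -> %d sequence(s), first id %d\n",
                   i, n_seq_id[i], seq_id_arr[i][0]);
        }
        return 0;
    }
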
|
@@ -4509,6 +5913,13 @@ static int llama_decode_internal(
|
|
4509
5913
|
|
4510
5914
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
4511
5915
|
|
5916
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
5917
|
+
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
5918
|
+
|
5919
|
+
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
5920
|
+
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
5921
|
+
|
5922
|
+
|
4512
5923
|
#ifdef GGML_USE_CUBLAS
|
4513
5924
|
for (int i = 0; i < gf->n_leafs; i++) {
|
4514
5925
|
ggml_tensor * node = gf->leafs[i];
|
@@ -4526,6 +5937,12 @@ static int llama_decode_internal(
|
|
4526
5937
|
}
|
4527
5938
|
|
4528
5939
|
ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
|
5940
|
+
|
5941
|
+
// HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
|
5942
|
+
if (!lctx.embedding.empty()) {
|
5943
|
+
embeddings->backend = GGML_BACKEND_CPU;
|
5944
|
+
}
|
5945
|
+
res->backend = GGML_BACKEND_CPU;
|
4529
5946
|
#endif
|
4530
5947
|
|
4531
5948
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
@@ -4543,18 +5960,13 @@ static int llama_decode_internal(
|
|
4543
5960
|
const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
|
4544
5961
|
model.arch == LLM_ARCH_BAICHUAN ||
|
4545
5962
|
model.arch == LLM_ARCH_FALCON ||
|
4546
|
-
model.arch == LLM_ARCH_REFACT
|
5963
|
+
model.arch == LLM_ARCH_REFACT ||
|
5964
|
+
model.arch == LLM_ARCH_MPT;
|
4547
5965
|
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
4548
5966
|
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
4549
5967
|
n_threads = 1;
|
4550
5968
|
}
|
4551
5969
|
|
4552
|
-
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
4553
|
-
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
4554
|
-
|
4555
|
-
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
4556
|
-
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
4557
|
-
|
4558
5970
|
#if GGML_USE_MPI
|
4559
5971
|
const int64_t n_layer = hparams.n_layer;
|
4560
5972
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
@@ -4576,8 +5988,12 @@ static int llama_decode_internal(
|
|
4576
5988
|
#endif
|
4577
5989
|
|
4578
5990
|
// update the kv ring buffer
|
4579
|
-
lctx.kv_self.head += n_tokens;
|
4580
5991
|
lctx.kv_self.has_shift = false;
|
5992
|
+
lctx.kv_self.head += n_tokens;
|
5993
|
+
// Ensure kv cache head points to a valid index.
|
5994
|
+
if (lctx.kv_self.head >= lctx.kv_self.size) {
|
5995
|
+
lctx.kv_self.head = 0;
|
5996
|
+
}
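
The hunk above makes the KV cache write head behave like a ring-buffer index: after a decode it advances by n_tokens and is reset to 0 once it runs past the cache size, so the next slot search always starts from a valid index. The same guard in isolation, with invented sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t kv_size  = 16; // total cache cells
        uint32_t head     = 14; // next write position
        uint32_t n_tokens = 3;  // tokens just decoded

        head += n_tokens;
        if (head >= kv_size) {  // same guard as in the diff
            head = 0;
        }
        printf("new head = %u\n", head);
        return 0;
    }
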
|
4581
5997
|
|
4582
5998
|
#ifdef GGML_PERF
|
4583
5999
|
// print timing information per ggml operation (for debugging purposes)
|
@@ -4903,7 +6319,6 @@ struct llm_tokenizer_bpe {
|
|
4903
6319
|
llm_symbol sym;
|
4904
6320
|
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
4905
6321
|
sym.text = word.c_str() + offset;
|
4906
|
-
sym.n = 1;
|
4907
6322
|
sym.n = char_len;
|
4908
6323
|
offset += sym.n;
|
4909
6324
|
sym.prev = index - 1;
|
@@ -5040,7 +6455,6 @@ private:
|
|
5040
6455
|
for (int i = 0; i < (int)text_utf.size(); i++) {
|
5041
6456
|
const std::string & utf_char = text_utf[i];
|
5042
6457
|
bool split_condition = false;
|
5043
|
-
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
|
5044
6458
|
int bytes_remain = text_utf.size() - i;
|
5045
6459
|
// forward backward lookups
|
5046
6460
|
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
@@ -5066,9 +6480,9 @@ private:
|
|
5066
6480
|
if (!split_condition && bytes_remain >= 3) {
|
5067
6481
|
// 're|'ve|'ll
|
5068
6482
|
if (utf_char == "\'" && (
|
5069
|
-
(utf_char_next == "r"
|
5070
|
-
(utf_char_next == "v"
|
5071
|
-
(utf_char_next == "l"
|
6483
|
+
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
6484
|
+
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
6485
|
+
(utf_char_next == "l" && utf_char_next_next == "l"))
|
5072
6486
|
) {
|
5073
6487
|
split_condition = true;
|
5074
6488
|
}
|
@@ -5119,7 +6533,7 @@ private:
|
|
5119
6533
|
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
5120
6534
|
split_condition = true;
|
5121
6535
|
}
|
5122
|
-
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next)
|
6536
|
+
else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
5123
6537
|
split_condition = true;
|
5124
6538
|
}
|
5125
6539
|
}
|
@@ -5164,7 +6578,137 @@ private:
|
|
5164
6578
|
llm_bigram_bpe::queue work_queue;
|
5165
6579
|
};
|
5166
6580
|
|
5167
|
-
|
6581
|
+
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
|
6582
|
+
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
|
6583
|
+
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
|
6584
|
+
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
6585
|
+
|
6586
|
+
struct fragment_buffer_variant{
|
6587
|
+
fragment_buffer_variant(llama_vocab::id _token)
|
6588
|
+
:
|
6589
|
+
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
|
6590
|
+
token(_token),
|
6591
|
+
raw_text(_dummy),
|
6592
|
+
offset(0),
|
6593
|
+
length(0){}
|
6594
|
+
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
6595
|
+
:
|
6596
|
+
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
6597
|
+
token((llama_vocab::id)-1),
|
6598
|
+
raw_text(_raw_text),
|
6599
|
+
offset(_offset),
|
6600
|
+
length(_length){
|
6601
|
+
GGML_ASSERT( _offset >= 0 );
|
6602
|
+
GGML_ASSERT( _length >= 1 );
|
6603
|
+
GGML_ASSERT( offset + length <= raw_text.length() );
|
6604
|
+
}
|
6605
|
+
|
6606
|
+
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
6607
|
+
const llama_vocab::id token;
|
6608
|
+
const std::string _dummy;
|
6609
|
+
const std::string & raw_text;
|
6610
|
+
const uint64_t offset;
|
6611
|
+
const uint64_t length;
|
6612
|
+
};
|
6613
|
+
|
6614
|
+
// #define PRETOKENIZERDEBUG
|
6615
|
+
|
6616
|
+
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
|
6617
|
+
{
|
6618
|
+
// for each special token
|
6619
|
+
for (const auto & st: vocab.special_tokens_cache) {
|
6620
|
+
const auto & special_token = st.first;
|
6621
|
+
const auto & special_id = st.second;
|
6622
|
+
|
6623
|
+
// for each text fragment
|
6624
|
+
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
6625
|
+
while (it != buffer.end()) {
|
6626
|
+
auto & fragment = (*it);
|
6627
|
+
|
6628
|
+
// if a fragment is text ( not yet processed )
|
6629
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
6630
|
+
auto * raw_text = &(fragment.raw_text);
|
6631
|
+
|
6632
|
+
auto raw_text_base_offset = fragment.offset;
|
6633
|
+
auto raw_text_base_length = fragment.length;
|
6634
|
+
|
6635
|
+
// loop over the text
|
6636
|
+
while (true) {
|
6637
|
+
// find the first occurrence of a given special token in this fragment
|
6638
|
+
// passing the offset argument only limits the "search area", but match coordinates
|
6639
|
+
// are still relative to the source full raw_text
|
6640
|
+
auto match = raw_text->find(special_token, raw_text_base_offset);
|
6641
|
+
|
6642
|
+
// no occurrences found, stop processing this fragment for a given special token
|
6643
|
+
if (match == std::string::npos) break;
|
6644
|
+
|
6645
|
+
// check if match is within bounds of offset <-> length
|
6646
|
+
if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
|
6647
|
+
|
6648
|
+
#ifdef PRETOKENIZERDEBUG
|
6649
|
+
fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
6650
|
+
#endif
|
6651
|
+
auto source = std::distance(buffer.begin(), it);
|
6652
|
+
|
6653
|
+
// if match is further than base offset
|
6654
|
+
// then we have some text to the left of it
|
6655
|
+
if (match > raw_text_base_offset) {
|
6656
|
+
// left
|
6657
|
+
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
6658
|
+
const int64_t left_reminder_length = match - raw_text_base_offset;
|
6659
|
+
buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
|
6660
|
+
|
6661
|
+
#ifdef PRETOKENIZERDEBUG
|
6662
|
+
fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
6663
|
+
#endif
|
6664
|
+
it++;
|
6665
|
+
}
|
6666
|
+
|
6667
|
+
// special token
|
6668
|
+
buffer.emplace_after(it, special_id);
|
6669
|
+
it++;
|
6670
|
+
|
6671
|
+
// right
|
6672
|
+
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
6673
|
+
const int64_t right_reminder_offset = match + special_token.length();
|
6674
|
+
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
6675
|
+
buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
|
6676
|
+
|
6677
|
+
#ifdef PRETOKENIZERDEBUG
|
6678
|
+
fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
6679
|
+
#endif
|
6680
|
+
|
6681
|
+
it++;
|
6682
|
+
|
6683
|
+
if (source == 0) {
|
6684
|
+
buffer.erase_after(buffer.before_begin());
|
6685
|
+
} else {
|
6686
|
+
buffer.erase_after(std::next(buffer.begin(), (source-1)));
|
6687
|
+
}
|
6688
|
+
|
6689
|
+
// repeat for the right side
|
6690
|
+
raw_text_base_offset = right_reminder_offset;
|
6691
|
+
raw_text_base_length = right_reminder_length;
|
6692
|
+
|
6693
|
+
#ifdef PRETOKENIZERDEBUG
|
6694
|
+
fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
6695
|
+
#endif
|
6696
|
+
} else {
|
6697
|
+
if (source == 0) {
|
6698
|
+
buffer.erase_after(buffer.before_begin());
|
6699
|
+
} else {
|
6700
|
+
buffer.erase_after(std::next(buffer.begin(), (source-1)));
|
6701
|
+
}
|
6702
|
+
break;
|
6703
|
+
}
|
6704
|
+
}
|
6705
|
+
}
|
6706
|
+
it++;
|
6707
|
+
}
|
6708
|
+
}
|
6709
|
+
}
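
tokenizer_st_partition above splits every raw-text fragment around each cached special token, producing an alternating sequence of raw-text pieces and already-resolved token ids. Below is a simplified, single-special-token version of that splitting logic; the real code operates in place on a forward_list and loops over the whole special-token cache, and the fragment struct and names here are invented for the sketch:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct fragment {
        bool        is_token;  // true: resolved special token, false: raw text
        int         token_id;  // valid when is_token
        std::string text;      // valid when !is_token
    };

    static std::vector<fragment> partition(const std::string & raw,
                                           const std::string & special, int special_id) {
        std::vector<fragment> out;
        size_t pos = 0;
        while (pos < raw.size()) {
            const size_t match = raw.find(special, pos);
            if (match == std::string::npos) {
                out.push_back({false, -1, raw.substr(pos)});  // trailing text
                break;
            }
            if (match > pos) {
                out.push_back({false, -1, raw.substr(pos, match - pos)}); // text left of the match
            }
            out.push_back({true, special_id, ""});                        // the special token itself
            pos = match + special.size();                                 // continue to the right
        }
        return out;
    }

    int main() {
        for (const auto & f : partition("Hello<sep>world<sep>", "<sep>", 3)) {
            if (f.is_token) printf("[token %d]\n", f.token_id);
            else            printf("[text '%s']\n", f.text.c_str());
        }
        return 0;
    }
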
|
6710
|
+
|
6711
|
+
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
|
5168
6712
|
std::vector<llama_vocab::id> output;
|
5169
6713
|
|
5170
6714
|
// OG tokenizer behavior:
|
@@ -5180,20 +6724,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
5180
6724
|
return output;
|
5181
6725
|
}
|
5182
6726
|
|
6727
|
+
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
6728
|
+
fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
|
6729
|
+
|
6730
|
+
if (special) tokenizer_st_partition( vocab, fragment_buffer );
|
6731
|
+
|
5183
6732
|
switch (vocab.type) {
|
5184
6733
|
case LLAMA_VOCAB_TYPE_SPM:
|
5185
6734
|
{
|
5186
|
-
|
5187
|
-
|
6735
|
+
for (const auto & fragment: fragment_buffer)
|
6736
|
+
{
|
6737
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
6738
|
+
{
|
6739
|
+
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
5188
6740
|
|
5189
|
-
|
5190
|
-
|
5191
|
-
|
6741
|
+
// TODO: It's likely possible to get rid of this string copy entirely
|
6742
|
+
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
6743
|
+
// and passing 'add space prefix' as bool argument
|
6744
|
+
//
|
6745
|
+
auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
|
6746
|
+
|
6747
|
+
#ifdef PRETOKENIZERDEBUG
|
6748
|
+
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
6749
|
+
#endif
|
6750
|
+
llm_tokenizer_spm tokenizer(vocab);
|
6751
|
+
llama_escape_whitespace(raw_text);
|
6752
|
+
tokenizer.tokenize(raw_text, output);
|
6753
|
+
}
|
6754
|
+
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
6755
|
+
{
|
6756
|
+
output.push_back(fragment.token);
|
6757
|
+
}
|
6758
|
+
}
|
5192
6759
|
} break;
|
5193
6760
|
case LLAMA_VOCAB_TYPE_BPE:
|
5194
6761
|
{
|
5195
|
-
|
5196
|
-
|
6762
|
+
for (const auto & fragment: fragment_buffer)
|
6763
|
+
{
|
6764
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
6765
|
+
{
|
6766
|
+
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
6767
|
+
|
6768
|
+
#ifdef PRETOKENIZERDEBUG
|
6769
|
+
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
6770
|
+
#endif
|
6771
|
+
llm_tokenizer_bpe tokenizer(vocab);
|
6772
|
+
tokenizer.tokenize(raw_text, output);
|
6773
|
+
}
|
6774
|
+
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
6775
|
+
{
|
6776
|
+
output.push_back(fragment.token);
|
6777
|
+
}
|
6778
|
+
}
|
5197
6779
|
} break;
|
5198
6780
|
}
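
In the SPM branch above each raw-text fragment gets a leading space and is passed through llama_escape_whitespace before tokenizing, while fragments that are already token ids go straight to the output. The escaping replaces every space with the SentencePiece space marker U+2581; a small sketch of that step follows (it mirrors the SentencePiece convention, not the exact body of llama_escape_whitespace, which is defined elsewhere in this file):

    #include <cstdio>
    #include <string>

    static std::string escape_whitespace(std::string text) {
        std::string out;
        for (char c : text) {
            if (c == ' ') out += "\xe2\x96\x81"; // U+2581, SPM space marker
            else          out += c;
        }
        return out;
    }

    int main() {
        const std::string fragment = "hello world";
        // leading space added first (as in the SPM branch), then escaped
        const std::string prepared = escape_whitespace(" " + fragment);
        printf("%s\n", prepared.c_str());
        return 0;
    }
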
|
5199
6781
|
|
@@ -5466,7 +7048,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     std::vector<llama_grammar_candidate> rejects;

     if (stack.empty()) {
-        for (auto tok : candidates) {
+        for (const auto & tok : candidates) {
             if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
                 rejects.push_back(tok);
             }
@@ -5477,7 +7059,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     const llama_grammar_element * stack_pos = stack.back();

     std::vector<llama_grammar_candidate> next_candidates;
-    for (auto tok : candidates) {
+    for (const auto & tok : candidates) {
         if (*tok.code_points == 0) {
             // reached end of full codepoints in token, reject iff it ended in a partial sequence
             // that cannot satisfy this position in grammar
@@ -5503,7 +7085,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         llama_grammar_advance_stack(rules, stack_after, next_stacks);

         auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-        for (auto tok : next_rejects) {
+        for (const auto & tok : next_rejects) {
             rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
         }

@@ -6635,7 +8217,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);

         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6672,6 +8254,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }

     std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors

     const size_t meta_size = gguf_get_meta_size(ctx_out);

@@ -7535,6 +9118,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam
 }

 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst) {
+        return;
+    }
     llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
 }

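The early return added here makes copying a sequence onto itself a no-op. For orientation, a minimal sketch of how a caller might use `llama_kv_cache_seq_cp` to fan one evaluated prompt out to several sequences; the `share_prompt` helper and its parameters are hypothetical, and `ctx` is assumed to already hold the prompt in sequence 0:

#include "llama.h"

// Sketch: reuse a prompt evaluated once on sequence 0 for n_parallel sequences.
// Assumes positions [0, n_past) of sequence 0 are already in the KV cache.
static void share_prompt(struct llama_context * ctx, llama_pos n_past, int32_t n_parallel) {
    for (llama_seq_id s = 1; s < n_parallel; ++s) {
        // copies the cached cells of sequence 0 in [0, n_past) to sequence s;
        // with the change above, passing s == 0 would simply return
        llama_kv_cache_seq_cp(ctx, 0, s, 0, n_past);
    }
}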
@@ -7987,7 +9573,7 @@ int llama_eval_embd(
                            int   n_past) {
     llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };

     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
@@ -8008,20 +9594,21 @@ struct llama_batch llama_batch_get_one(
                   llama_pos   pos_0,
                llama_seq_id   seq_id) {
     return {
-        /*n_tokens    =*/ n_tokens,
-        /*tokens      =*/ tokens,
-        /*embd        =*/ nullptr,
-        /*pos         =*/ nullptr,
-        /*seq_id      =*/ nullptr,
-        /*logits      =*/ nullptr,
-        /*all_pos_0   =*/ pos_0,
-        /*all_pos_1   =*/ 1,
-        /*all_seq_id  =*/ seq_id,
+        /*n_tokens     =*/ n_tokens,
+        /*tokens       =*/ tokens,
+        /*embd         =*/ nullptr,
+        /*pos          =*/ nullptr,
+        /*n_seq_id     =*/ nullptr,
+        /*seq_id       =*/ nullptr,
+        /*logits       =*/ nullptr,
+        /*all_pos_0    =*/ pos_0,
+        /*all_pos_1    =*/ 1,
+        /*all_seq_id   =*/ seq_id,
     };
 }

-struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
-    llama_batch batch = {
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -8029,19 +9616,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }

-    batch.pos    = (llama_pos *)    malloc(sizeof(llama_pos)    * n_tokens);
-    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
-    batch.logits = (int8_t *)       malloc(sizeof(int8_t)       * n_tokens);
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
+    for (int i = 0; i < n_tokens; ++i) {
+        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+    }
+    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens);

     return batch;
 }

 void llama_batch_free(struct llama_batch batch) {
-    if (batch.token)  free(batch.token);
-    if (batch.embd)   free(batch.embd);
-    if (batch.pos)    free(batch.pos);
-    if (batch.seq_id) free(batch.seq_id);
-    if (batch.logits) free(batch.logits);
+    if (batch.token)    free(batch.token);
+    if (batch.embd)     free(batch.embd);
+    if (batch.pos)      free(batch.pos);
+    if (batch.n_seq_id) free(batch.n_seq_id);
+    if (batch.seq_id) {
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            free(batch.seq_id[i]);
+        }
+        free(batch.seq_id);
+    }
+    if (batch.logits)   free(batch.logits);
 }

 int llama_decode(
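With `seq_id` now a per-token array of sequence ids and `llama_batch_init` taking an `n_seq_max` argument, a caller fills a batch roughly as below. This is a minimal sketch assuming a loaded context `ctx` and a `prompt` vector of token ids; the `decode_prompt` helper itself is illustrative, not part of the library:

#include <vector>
#include "llama.h"

// Sketch: build a single-sequence batch with the 0.8.0-era API and decode it.
static int decode_prompt(struct llama_context * ctx, const std::vector<llama_token> & prompt) {
    llama_batch batch = llama_batch_init((int32_t) prompt.size(), /*embd =*/ 0, /*n_seq_max =*/ 1);

    batch.n_tokens = (int32_t) prompt.size();
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.token[i]     = prompt[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;                         // each token belongs to one sequence here
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = (i == batch.n_tokens - 1); // only the last token needs logits
    }

    const int ret = llama_decode(ctx, batch);
    llama_batch_free(batch);                            // frees token/pos/n_seq_id/seq_id[i]/logits
    return ret;
}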
@@ -8106,15 +9703,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) {
     return ctx->model.vocab.special_eot_id;
 }

-
 int llama_tokenize(
     const struct llama_model * model,
                   const char * text,
                          int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
-                        bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
+                        bool   add_bos,
+                        bool   special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);

     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
|
8166
9763
|
buf[0] = llama_token_to_byte(model->vocab, token);
|
8167
9764
|
return 1;
|
8168
9765
|
} else {
|
8169
|
-
|
9766
|
+
// TODO: for now we accept all unsupported token types,
|
9767
|
+
// suppressing them like CONTROL tokens.
|
9768
|
+
// GGML_ASSERT(false);
|
8170
9769
|
}
|
8171
9770
|
break;
|
8172
9771
|
}
|
@@ -8182,7 +9781,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
|
|
8182
9781
|
} else if (llama_is_control_token(model->vocab, token)) {
|
8183
9782
|
;
|
8184
9783
|
} else {
|
8185
|
-
|
9784
|
+
// TODO: for now we accept all unsupported token types,
|
9785
|
+
// suppressing them like CONTROL tokens.
|
9786
|
+
// GGML_ASSERT(false);
|
8186
9787
|
}
|
8187
9788
|
break;
|
8188
9789
|
}
|
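For completeness, the matching detokenization call that these hunks relax for unsupported token types is `llama_token_to_piece`. A minimal wrapper sketch, assuming a negative return value signals the required buffer size; the `token_to_piece` helper is illustrative:

#include <string>
#include "llama.h"

// Sketch: convert one token id back to its text piece, growing the buffer if needed.
static std::string token_to_piece(const struct llama_model * model, llama_token token) {
    std::string piece(8, '\0');  // small initial buffer
    int n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
    if (n < 0) {                 // |n| is the size the piece actually needs
        piece.resize(-n);
        n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
    }
    piece.resize(n);
    return piece;
}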