llama_cpp 0.7.0 → 0.8.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -75,6 +75,7 @@
 #include <thread>
 #include <unordered_map>
 #include <set>
+#include <forward_list>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -186,7 +187,9 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };

@@ -199,7 +202,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT,       "mpt"       },
     { LLM_ARCH_BAICHUAN,  "baichuan"  },
     { LLM_ARCH_STARCODER, "starcoder" },
-    { LLM_ARCH_REFACT,    "refact"    },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT,    "refact"    },
+    { LLM_ARCH_BLOOM,     "bloom"     },
 };

 enum llm_kv {
@@ -302,6 +307,7 @@ struct LLM_KV {

 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -318,6 +324,8 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
 };

 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -399,10 +407,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,   "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PERSIMMON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd"},
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm"},
+            { LLM_TENSOR_OUTPUT,        "output"},
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm"},
+            { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv"},
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output"},
+            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm"},
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm"},
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down"},
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up"},
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+        },
+    },
     {
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
         },
     },
     {
@@ -437,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,   "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -954,6 +1002,7 @@ enum e_model {
     MODEL_1B,
     MODEL_3B,
     MODEL_7B,
+    MODEL_8B,
     MODEL_13B,
     MODEL_15B,
     MODEL_30B,
@@ -984,6 +1033,9 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;

+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
@@ -1036,6 +1088,10 @@ struct llama_layer {
     struct ggml_tensor * attn_norm_b;
     struct ggml_tensor * attn_norm_2;
     struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;

     // attention
     struct ggml_tensor * wq;
@@ -1077,6 +1133,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;

+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;

@@ -1120,6 +1179,8 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;

+    std::unordered_map<token, id> special_tokens_cache;
+
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;

     // default LLaMA special tokens
@@ -1162,6 +1223,8 @@ struct llama_model {

     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_norm;
+    struct ggml_tensor * tok_norm_b;

     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1291,7 +1354,11 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

+    // TODO: this should be:
+    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    // change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1334,6 +1401,8 @@ static bool llama_kv_cache_init(

 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
@@ -1349,8 +1418,8 @@ static bool llama_kv_cache_find_slot(

     while (true) {
         if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
             cache.head = 0;
-            n_tested += n_ctx - cache.head;
             continue;
         }

@@ -1376,7 +1445,10 @@

     for (uint32_t i = 0; i < n_tokens; i++) {
         cache.cells[cache.head + i].pos = batch.pos[i];
-        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+
+        for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
+            cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
+        }
     }

     return true;
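The hunk above is part of the release's switch from one sequence id per token to a per-token list: `batch.seq_id[i]` now points at `batch.n_seq_id[i]` ids, so a single KV cell can be tagged with several sequences at once. A minimal standalone sketch of that layout and the tagging loop (the `batch_sketch`/`cell_sketch` types are hypothetical stand-ins, not the real llama.h structs):

    #include <cstdint>
    #include <set>
    #include <vector>

    // Sketch of the per-token sequence-id layout used by the new find_slot code.
    struct batch_sketch {
        int32_t                           n_tokens;
        std::vector<int32_t>              pos;      // position of token i
        std::vector<int32_t>              n_seq_id; // number of seq ids for token i
        std::vector<std::vector<int32_t>> seq_id;   // seq_id[i][0 .. n_seq_id[i])
    };

    struct cell_sketch {
        int32_t           pos = -1;
        std::set<int32_t> seq_id; // a cell may now belong to several sequences
    };

    // Mirrors the loop in llama_kv_cache_find_slot: tag each cell with every
    // sequence the corresponding token participates in.
    void tag_cells(std::vector<cell_sketch> & cells, uint32_t head, const batch_sketch & batch) {
        for (int32_t i = 0; i < batch.n_tokens; i++) {
            cells[head + i].pos = batch.pos[i];
            for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
                cells[head + i].seq_id.insert(batch.seq_id[i][j]);
            }
        }
    }

This is what allows one decoded batch to serve several sequences that share a prefix, since the shared cells simply carry all of the relevant sequence ids.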
@@ -1401,6 +1473,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }

 static void llama_kv_cache_seq_rm(
@@ -1408,6 +1483,8 @@ static void llama_kv_cache_seq_rm(
         llama_seq_id seq_id,
            llama_pos p0,
            llama_pos p1) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -1416,9 +1493,13 @@ static void llama_kv_cache_seq_rm(
             cache.cells[i].seq_id.erase(seq_id);
             if (cache.cells[i].seq_id.empty()) {
                 cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
@@ -1430,6 +1511,8 @@ static void llama_kv_cache_seq_cp(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    cache.head = 0;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1438,12 +1521,21 @@ static void llama_kv_cache_seq_cp(
 }

 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
+        } else {
+            cache.cells[i].seq_id.clear();
+            cache.cells[i].seq_id.insert(seq_id);
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_shift(
@@ -1452,6 +1544,8 @@ static void llama_kv_cache_seq_shift(
            llama_pos p0,
            llama_pos p1,
            llama_pos delta) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -1461,12 +1555,17 @@ static void llama_kv_cache_seq_shift(
             if (cache.cells[i].pos < 0) {
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) new_head = i;
             } else {
                 cache.has_shift = true;
                 cache.cells[i].delta = delta;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
 }

 //
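All of the cache-mutation hunks above apply the same idiom: `new_head` starts at `cache.size` (an out-of-range index used as a "nothing freed yet" sentinel), records the first cell that becomes empty, and is written back to `cache.head` only when a cell was actually freed, so the next slot search can start there instead of at index 0. A minimal sketch of the idiom in isolation, with hypothetical names:

    #include <cstdint>
    #include <vector>

    struct cell { int32_t pos = -1; };   // pos == -1 means the cell is free

    // Clear every cell matching `pred`; return the index the next slot search
    // should start from. `size` doubles as the "nothing freed" sentinel.
    template <typename Pred>
    uint32_t clear_and_hint(std::vector<cell> & cells, Pred pred, uint32_t old_head) {
        const uint32_t size = (uint32_t) cells.size();
        uint32_t new_head = size; // sentinel: no cell freed yet

        for (uint32_t i = 0; i < size; ++i) {
            if (pred(cells[i])) {
                cells[i].pos = -1;
                if (new_head == size) new_head = i; // remember the first freed cell
            }
        }

        // If we freed up a slot, searching can start there; otherwise keep the old head.
        return new_head != size ? new_head : old_head;
    }

The hint is only an optimization of where scanning begins; correctness still comes from the scan itself, which is why the note added to `llama_kv_cache` warns that `head` cannot be changed freely once a slot is in use.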
@@ -1670,7 +1769,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ctx, true);
         }
@@ -1688,7 +1787,7 @@ struct llama_model_loader {
         return tensor;
     }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

         if (cur == NULL) {
@@ -1867,6 +1966,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
+        case MODEL_8B:  return "8B";
         case MODEL_13B: return "13B";
         case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
@@ -1979,6 +2079,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_REFACT:
             {
                 GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1987,6 +2095,33 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                        } break;
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                hparams.f_clamp_kqv = 0.0f;
+
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_30B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -1994,7 +2129,7 @@ static void llm_load_hparams(
 }

 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

 static void llm_load_vocab(
@@ -2110,6 +2245,101 @@ static void llm_load_vocab(
     GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
     GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
     GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+
+    // build special tokens cache
+    {
+        // TODO: It is unclear (to me) at this point, whether special tokens are guaranteed to be of a deterministic type,
+        //  and will always be correctly labeled in 'added_tokens.json' etc.
+        // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
+        //  to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
+        //  are special tokens.
+        // From testing, this appears to correlate 1:1 with special tokens.
+        //
+
+        // Counting special tokens and verifying in only one direction
+        //  is sufficient to detect difference in those two sets.
+        //
+        uint32_t special_tokens_count_by_type = 0;
+        uint32_t special_tokens_count_from_verification = 0;
+
+        bool special_tokens_definition_mismatch = false;
+
+        for (const auto & t : vocab.token_to_id) {
+            const auto & token = t.first;
+            const auto & id    = t.second;
+
+            // Count all non-normal tokens in the vocab while iterating
+            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+                special_tokens_count_by_type++;
+            }
+
+            // Skip single character tokens
+            if (token.length() > 1) {
+                bool is_tokenizable = false;
+
+                // Split token string representation in two, in all possible ways
+                //  and check if both halves can be matched to a valid token
+                for (unsigned i = 1; i < token.length();) {
+                    const auto left  = token.substr(0, i);
+                    const auto right = token.substr(i);
+
+                    // check if we didn't partition in the middle of a utf sequence
+                    auto utf = utf8_len(left.at(left.length() - 1));
+
+                    if (utf == 1) {
+                        if (vocab.token_to_id.find(left)  != vocab.token_to_id.end() &&
+                            vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
+                            is_tokenizable = true;
+                            break;
+                        }
+                        i++;
+                    } else {
+                        // skip over the rest of multibyte utf sequence
+                        i += utf - 1;
+                    }
+                }
+
+                if (!is_tokenizable) {
+                    // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
+                    //  it's faster to re-filter them here, since there are way less candidates now
+
+                    // Calculate a total "utf" length of a token string representation
+                    size_t utf8_str_len = 0;
+                    for (unsigned i = 0; i < token.length();) {
+                        utf8_str_len++;
+                        i += utf8_len(token.at(i));
+                    }
+
+                    // And skip the ones which are one character
+                    if (utf8_str_len > 1) {
+                        // At this point what we have left are special tokens only
+                        vocab.special_tokens_cache[token] = id;
+
+                        // Count manually found special tokens
+                        special_tokens_count_from_verification++;
+
+                        // If this manually found special token is not marked as such, flag a mismatch
+                        if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
+                            special_tokens_definition_mismatch = true;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
+            LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size(),
+                special_tokens_count_by_type, vocab.id_to_token.size()
+            );
+        } else {
+            LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size()
+            );
+        }
+    }
 }

 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
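The verification loop added above treats a vocab entry as special when no split point yields two halves that are both vocab entries themselves; an entry like "<|endoftext|>" typically fails every split, while an ordinary merged token such as "ing" does not. A compact restatement of that check, assuming a plain map in place of the real vocab type and omitting the UTF-8 boundary handling shown in the diff:

    #include <string>
    #include <unordered_map>

    // True if `token` can be split anywhere into two strings that are both
    // themselves vocab entries -- i.e. the tokenizer could match it piecewise.
    // (The byte-level/UTF-8 handling from the real code is omitted here.)
    static bool is_tokenizable(const std::string & token,
                               const std::unordered_map<std::string, int> & vocab) {
        for (size_t i = 1; i < token.length(); ++i) {
            if (vocab.count(token.substr(0, i)) && vocab.count(token.substr(i))) {
                return true;
            }
        }
        return false;
    }

    // Entries longer than one character that are NOT tokenizable in this sense
    // are treated as special tokens and placed in special_tokens_cache.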
@@ -2131,6 +2361,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_gqa            = %u\n",   __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n", __func__, hparams.f_clamp_kqv);
+    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",   __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",   __func__, hparams.rope_freq_scale_train);
@@ -2230,8 +2462,8 @@ static void llm_load_tensors(

             // output
             {
-                ggml_backend backend_norm;
-                ggml_backend backend_output;
+                ggml_backend_type backend_norm;
+                ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2266,8 +2498,8 @@ static void llm_load_tensors(
             model.layers.resize(n_layer);

             for (uint32_t i = 0; i < n_layer; ++i) {
-                const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                 auto & layer = model.layers[i];

@@ -2296,8 +2528,8 @@ static void llm_load_tensors(
         {
             model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
             {
-                ggml_backend backend_norm;
-                ggml_backend backend_output;
+                ggml_backend_type backend_norm;
+                ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2332,8 +2564,8 @@ static void llm_load_tensors(
             model.layers.resize(n_layer);

             for (uint32_t i = 0; i < n_layer; ++i) {
-                const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                 auto & layer = model.layers[i];

@@ -2366,8 +2598,8 @@ static void llm_load_tensors(

             // output
             {
-                ggml_backend backend_norm;
-                ggml_backend backend_output;
+                ggml_backend_type backend_norm;
+                ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2404,8 +2636,8 @@ static void llm_load_tensors(
             model.layers.resize(n_layer);

             for (uint32_t i = 0; i < n_layer; ++i) {
-                const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                 auto & layer = model.layers[i];

@@ -2443,8 +2675,8 @@ static void llm_load_tensors(

             // output
             {
-                ggml_backend backend_norm;
-                ggml_backend backend_output;
+                ggml_backend_type backend_norm;
+                ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2481,8 +2713,8 @@ static void llm_load_tensors(
             model.layers.resize(n_layer);

             for (uint32_t i = 0; i < n_layer; ++i) {
-                const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                 auto & layer = model.layers[i];

@@ -2515,117 +2747,327 @@ static void llm_load_tensors(
                     }
                 }
             } break;
-
-
-
-            }
+        case LLM_ARCH_PERSIMMON:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

-
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;

-
-
-
-
-
-
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32

-
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm   = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }

-
-
+                    model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+                    model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);

-
-
-
-
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }

-
-
-
-
-
-
-
+                const uint32_t n_ff = hparams.n_ff;
+                const int i_gpu_start = n_layer - n_gpu_layers;
+                model.layers.resize(n_layer);
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    auto & layer = model.layers[i];
+                    layer.attn_norm     = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias",   i), {n_embd}, backend);
+                    layer.wqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "bias",   i), {n_embd + 2*n_embd_gqa},         backend_split);
+                    layer.wo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "bias",   i), {n_embd},         backend_split);
+                    layer.w2            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "weight", i), {n_ff, n_embd},   backend_split);
+                    layer.b2            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "bias",   i), {n_embd},         backend_split);
+                    layer.w3            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "weight", i), {n_embd, n_ff},   backend_split);
+                    layer.b3            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "bias",   i), {n_ff},           backend_split);
+                    layer.ffn_norm      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "bias",   i), {n_embd}, backend);
+                    layer.attn_q_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {64}, backend);
+                    layer.attn_k_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {64}, backend);
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                // TODO: CPU-only for now

-
-
-
-    (void) n_gpu_layers;
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
-}
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.tok_norm       = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd},          GGML_BACKEND_CPU);
+                model.tok_norm_b     = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd},          GGML_BACKEND_CPU);

-
-
-
-
-}
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;

-
-
-
-
-
-#
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32

-
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm   = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }

-
-
-
+                    model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+                    model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);

-
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }

-
-        // we take page faults deferred by mmap() into consideration
-        model.t_load_us = ggml_time_us() - model.t_start_us;
-    }
+                const uint32_t n_ff = hparams.n_ff;

-
-        const std::string & fname,
-        llama_model & model,
-        int n_gpu_layers,
-        int main_gpu,
-        const float * tensor_split,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
-    try {
-        llama_model_loader ml(fname, use_mmap);
+                const int i_gpu_start = n_layer - n_gpu_layers;

-
+                model.layers.resize(n_layer);

-
-
-
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

-
+                    auto & layer = model.layers[i];

-
-
-        }
+                    layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, backend);

-
-
-        return true;
-    }
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa},         backend_split);

-
-
-            main_gpu, tensor_split,
-            use_mlock, progress_callback, progress_callback_user_data);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
-    }
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd},         backend_split);

-
-}
+                    layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd},       backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff},         backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                            ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+                            ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3)          +
+                            ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2);
+                    }
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm   = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.wqkv      = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,  "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.wo        = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,  "weight", i), {n_embd, n_embd},               backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,  n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) +
+                            ggml_nbytes(layer.wqkv)      +
+                            ggml_nbytes(layer.wo)        +
+                            ggml_nbytes(layer.ffn_norm)  +
+                            ggml_nbytes(layer.w2)        +
+                            ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        default:
+            throw std::runtime_error("unknown architecture");
+        }
+    }
+
+    ml.done_getting_tensors();
+
+    // print memory requirements
+    {
+        // this is the total memory required to run the inference
+        size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        const int max_backend_supported_layers = hparams.n_layer + 3;
+        const int max_offloadable_layers       = hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers       = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+#else
+        (void) n_gpu_layers;
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+    }
+
+    // populate `tensors_by_name`
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
+        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+    }
+
+    (void) tensor_split;
+#ifdef GGML_USE_CUBLAS
+    {
+        ggml_cuda_set_tensor_split(tensor_split);
+    }
+#endif
+
+    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml.mapping);
+
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+}
+
+static bool llama_model_load(
+        const std::string & fname,
+        llama_model & model,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        bool use_mmap,
+        bool use_mlock,
+        bool vocab_only,
+        llama_progress_callback progress_callback,
+        void *progress_callback_user_data) {
+    try {
+        llama_model_loader ml(fname, use_mmap);
+
+        model.hparams.vocab_only = vocab_only;
+
+        llm_load_arch   (ml, model);
+        llm_load_hparams(ml, model);
+        llm_load_vocab  (ml, model);
+
+        llm_load_print_meta(ml, model);
+
+        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+            throw std::runtime_error("vocab size mismatch");
+        }
+
+        if (vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return true;
+        }
+
+        llm_load_tensors(
+            ml, model, n_gpu_layers,
+            main_gpu, tensor_split,
+            use_mlock, progress_callback, progress_callback_user_data);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
+        return false;
+    }
+
+    return true;
+}

 static struct ggml_cgraph * llm_build_llama(
-
-
+         llama_context & lctx,
+     const llama_batch & batch) {
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -2663,11 +3105,9 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -2739,7 +3179,7 @@ static struct ggml_cgraph * llm_build_llama(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3051,11 +3491,9 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3127,7 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3452,11 +3890,9 @@ static struct ggml_cgraph * llm_build_refact(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3528,7 +3964,7 @@ static struct ggml_cgraph * llm_build_refact(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3806,11 +4242,9 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3882,7 +4316,7 @@ static struct ggml_cgraph * llm_build_falcon(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4166,11 +4600,9 @@ static struct ggml_cgraph * llm_build_starcoder(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4199,23 +4631,919 @@ static struct ggml_cgraph * llm_build_starcoder(

         ggml_allocr_alloc(lctx.alloc, token);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-        memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < n_tokens; ++i) {
+                ((int32_t *) inp_positions->data)[i] = batch.pos[i];
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");
+
+        position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+    }
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+static struct ggml_cgraph * llm_build_persimmon(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const auto & cparams = lctx.cparams;
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+    const size_t n_rot        = n_embd_head / 2;
+
+    const float freq_base  = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
+    const float norm_eps   = hparams.f_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    auto & buf_compute = lctx.buf_compute;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
|
4918
|
+
for (int i = 0; i < n_tokens; ++i) {
|
4919
|
+
data[i] = batch.pos[i];
|
4920
|
+
}
|
4921
|
+
}
|
4922
|
+
if (do_rope_shift) {
|
4923
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
4924
|
+
offload_func_kq(K_shift);
|
4925
|
+
ggml_set_name(K_shift, "K_shift");
|
4926
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
4927
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4928
|
+
int * data = (int *) K_shift->data;
|
4929
|
+
for (int i = 0; i < n_ctx; ++i) {
|
4930
|
+
data[i] = kv_self.cells[i].delta;
|
4931
|
+
}
|
4932
|
+
}
|
4933
|
+
for (int il = 0; il < n_layer; ++il) {
|
4934
|
+
struct ggml_tensor * tmp =
|
4935
|
+
// we rotate only the first n_rot dimensions.
|
4936
|
+
ggml_rope_custom_inplace(ctx0,
|
4937
|
+
ggml_view_3d(ctx0, kv_self.k,
|
4938
|
+
n_rot, n_head, n_ctx,
|
4939
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4940
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
4941
|
+
ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
|
4942
|
+
),
|
4943
|
+
K_shift, n_rot, 2, 0, freq_base, freq_scale);
|
4944
|
+
offload_func_kq(tmp);
|
4945
|
+
ggml_build_forward_expand(gf, tmp);
|
4946
|
+
}
|
4947
|
+
}
|
4948
|
+
for (int il=0; il < n_layer; ++il) {
|
4949
|
+
struct ggml_tensor * residual = inpL;
|
4950
|
+
offload_func_t offload_func = llama_nop;
|
4951
|
+
{
|
4952
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4953
|
+
offload_func(cur);
|
4954
|
+
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
|
4955
|
+
offload_func(cur);
|
4956
|
+
cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
|
4957
|
+
offload_func(cur);
|
4958
|
+
ggml_format_name(cur, "input_layernorm_%d", il);
|
4959
|
+
}
|
4960
|
+
// self attention
|
4961
|
+
{
|
4962
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
4963
|
+
offload_func_kq(cur);
|
4964
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
4965
|
+
offload_func_kq(cur);
|
4966
|
+
|
4967
|
+
// split qkv
|
4968
|
+
GGML_ASSERT(n_head_kv == n_head);
|
4969
|
+
ggml_set_name(cur, format("qkv_%d", il).c_str());
|
4970
|
+
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
|
4971
|
+
offload_func_kq(tmpqkv);
|
4972
|
+
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
|
4973
|
+
offload_func_kq(tmpqkv_perm);
|
4974
|
+
ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
|
4975
|
+
struct ggml_tensor * tmpq = ggml_view_3d(
|
4976
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
4977
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
4978
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
4979
|
+
0
|
4980
|
+
);
|
4981
|
+
offload_func_kq(tmpq);
|
4982
|
+
struct ggml_tensor * tmpk = ggml_view_3d(
|
4983
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
4984
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
4985
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
4986
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
|
4987
|
+
);
|
4988
|
+
offload_func_kq(tmpk);
|
4989
|
+
// Q/K Layernorm
|
4990
|
+
tmpq = ggml_norm(ctx0, tmpq, norm_eps);
|
4991
|
+
offload_func_kq(tmpq);
|
4992
|
+
tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
|
4993
|
+
offload_func_kq(tmpq);
|
4994
|
+
tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
|
4995
|
+
offload_func_kq(tmpq);
|
4996
|
+
|
4997
|
+
tmpk = ggml_norm(ctx0, tmpk, norm_eps);
|
4998
|
+
offload_func_v(tmpk);
|
4999
|
+
tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
|
5000
|
+
offload_func_v(tmpk);
|
5001
|
+
tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
|
5002
|
+
offload_func_v(tmpk);
|
5003
|
+
|
5004
|
+
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
5005
|
+
struct ggml_tensor * qrot = ggml_view_3d(
|
5006
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5007
|
+
ggml_element_size(tmpq) * n_embd_head,
|
5008
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5009
|
+
0
|
5010
|
+
);
|
5011
|
+
offload_func_kq(qrot);
|
5012
|
+
ggml_format_name(qrot, "qrot_%d", il);
|
5013
|
+
struct ggml_tensor * krot = ggml_view_3d(
|
5014
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5015
|
+
ggml_element_size(tmpk) * n_embd_head,
|
5016
|
+
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5017
|
+
0
|
5018
|
+
);
|
5019
|
+
offload_func_kq(krot);
|
5020
|
+
ggml_format_name(krot, "krot_%d", il);
|
5021
|
+
|
5022
|
+
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
5023
|
+
struct ggml_tensor * qpass = ggml_view_3d(
|
5024
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5025
|
+
ggml_element_size(tmpq) * n_embd_head,
|
5026
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5027
|
+
ggml_element_size(tmpq) * n_rot
|
5028
|
+
);
|
5029
|
+
offload_func_kq(qpass);
|
5030
|
+
ggml_format_name(qpass, "qpass_%d", il);
|
5031
|
+
struct ggml_tensor * kpass = ggml_view_3d(
|
5032
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5033
|
+
ggml_element_size(tmpk) * n_embd_head,
|
5034
|
+
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5035
|
+
ggml_element_size(tmpk) * n_rot
|
5036
|
+
);
|
5037
|
+
offload_func_kq(kpass);
|
5038
|
+
ggml_format_name(kpass, "kpass_%d", il);
|
5039
|
+
|
5040
|
+
struct ggml_tensor * qrotated = ggml_rope_custom(
|
5041
|
+
ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
5042
|
+
);
|
5043
|
+
offload_func_kq(qrotated);
|
5044
|
+
struct ggml_tensor * krotated = ggml_rope_custom(
|
5045
|
+
ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
5046
|
+
);
|
5047
|
+
offload_func_kq(krotated);
|
5048
|
+
// ggml currently only supports concatenation on dim=2
|
5049
|
+
// so we need to permute qrot, qpass, concat, then permute back.
|
5050
|
+
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
5051
|
+
offload_func_kq(qrotated);
|
5052
|
+
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
5053
|
+
offload_func_kq(krotated);
|
5054
|
+
|
5055
|
+
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
5056
|
+
offload_func_kq(qpass);
|
5057
|
+
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
5058
|
+
offload_func_kq(kpass);
|
5059
|
+
|
5060
|
+
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
5061
|
+
offload_func_kq(Qcur);
|
5062
|
+
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
5063
|
+
offload_func_kq(Kcur);
|
5064
|
+
|
5065
|
+
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
|
5066
|
+
offload_func_kq(Q);
|
5067
|
+
|
5068
|
+
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
5069
|
+
offload_func_kq(Kcur);
|
5070
|
+
{
|
5071
|
+
struct ggml_tensor * tmpv = ggml_view_3d(
|
5072
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
5073
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
5074
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
5075
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
|
5076
|
+
);
|
5077
|
+
offload_func_v(tmpv);
|
5078
|
+
// store K, V in cache
|
5079
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
5080
|
+
offload_func_v(Vcur);
|
5081
|
+
ggml_set_name(Vcur, "Vcur");
|
5082
|
+
|
5083
|
+
struct ggml_tensor * k = ggml_view_1d(
|
5084
|
+
ctx0, kv_self.k, n_tokens*n_embd_gqa,
|
5085
|
+
(ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
|
5086
|
+
);
|
5087
|
+
offload_func_kq(k);
|
5088
|
+
ggml_set_name(k, "k");
|
5089
|
+
|
5090
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5091
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
5092
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5093
|
+
offload_func_v(v);
|
5094
|
+
ggml_set_name(v, "v");
|
5095
|
+
|
5096
|
+
// important: storing RoPE-ed version of K in the KV cache!
|
5097
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5098
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5099
|
+
}
|
5100
|
+
struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
|
5101
|
+
n_embd_head, n_kv, n_head_kv,
|
5102
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5103
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5104
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5105
|
+
|
5106
|
+
offload_func_kq(K);
|
5107
|
+
ggml_format_name(K, "K_%d", il);
|
5108
|
+
|
5109
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5110
|
+
offload_func_kq(KQ);
|
5111
|
+
ggml_set_name(KQ, "KQ");
|
5112
|
+
|
5113
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
5114
|
+
offload_func_kq(KQ_scaled);
|
5115
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5116
|
+
|
5117
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
5118
|
+
offload_func_kq(KQ_masked);
|
5119
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
5120
|
+
|
5121
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
5122
|
+
offload_func_kq(KQ_soft_max);
|
5123
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5124
|
+
|
5125
|
+
struct ggml_tensor * V =
|
5126
|
+
ggml_view_3d(ctx0, kv_self.v,
|
5127
|
+
n_kv, n_embd_head, n_head_kv,
|
5128
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
5129
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5130
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5131
|
+
offload_func_v(V);
|
5132
|
+
ggml_set_name(V, "V");
|
5133
|
+
|
5134
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5135
|
+
offload_func_v(KQV);
|
5136
|
+
ggml_set_name(KQV, "KQV");
|
5137
|
+
|
5138
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5139
|
+
offload_func_v(KQV_merged);
|
5140
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5141
|
+
|
5142
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5143
|
+
offload_func_v(cur);
|
5144
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5145
|
+
|
5146
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
5147
|
+
offload_func(cur);
|
5148
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
5149
|
+
offload_func(cur);
|
5150
|
+
ggml_set_name(cur, "result_wo");
|
5151
|
+
}
|
5152
|
+
|
5153
|
+
struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
|
5154
|
+
offload_func(inpFF);
|
5155
|
+
ggml_set_name(inpFF, "inpFF");
|
5156
|
+
{
|
5157
|
+
// MLP
|
5158
|
+
{
|
5159
|
+
// Norm
|
5160
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
5161
|
+
offload_func(cur);
|
5162
|
+
cur = ggml_add(ctx0,
|
5163
|
+
ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
|
5164
|
+
model.layers[il].ffn_norm_b
|
5165
|
+
);
|
5166
|
+
ggml_set_name(cur, "ffn_norm");
|
5167
|
+
offload_func(cur);
|
5168
|
+
}
|
5169
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
|
5170
|
+
offload_func(cur);
|
5171
|
+
|
5172
|
+
cur = ggml_add(ctx0, cur, model.layers[il].b3);
|
5173
|
+
offload_func(cur);
|
5174
|
+
ggml_set_name(cur, "result_ffn_up");
|
5175
|
+
|
5176
|
+
cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
|
5177
|
+
ggml_set_name(cur, "result_ffn_act");
|
5178
|
+
offload_func(cur);
|
5179
|
+
offload_func(cur->src[0]);
|
5180
|
+
|
5181
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
5182
|
+
offload_func(cur);
|
5183
|
+
cur = ggml_add(ctx0,
|
5184
|
+
cur,
|
5185
|
+
model.layers[il].b2);
|
5186
|
+
offload_func(cur);
|
5187
|
+
ggml_set_name(cur, "outFF");
|
5188
|
+
}
|
5189
|
+
cur = ggml_add(ctx0, cur, inpFF);
|
5190
|
+
offload_func(cur);
|
5191
|
+
ggml_set_name(cur, "inpFF_+_outFF");
|
5192
|
+
inpL = cur;
|
5193
|
+
}
|
5194
|
+
cur = inpL;
|
5195
|
+
{
|
5196
|
+
cur = ggml_norm(ctx0, cur, norm_eps);
|
5197
|
+
offload_func_nr(cur);
|
5198
|
+
cur = ggml_mul(ctx0, cur, model.output_norm);
|
5199
|
+
offload_func_nr(cur);
|
5200
|
+
|
5201
|
+
cur = ggml_add(ctx0, cur, model.output_norm_b);
|
5202
|
+
// offload_func_nr(cur);
|
5203
|
+
|
5204
|
+
ggml_set_name(cur, "result_norm");
|
5205
|
+
}
|
5206
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5207
|
+
ggml_set_name(cur, "result_output");
|
5208
|
+
ggml_build_forward_expand(gf, cur);
|
5209
|
+
ggml_free(ctx0);
|
5210
|
+
return gf;
|
5211
|
+
}
|
5212
|
+
|
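
Note on the split rotary embedding in llm_build_persimmon above: only the first n_rot = n_embd_head / 2 dimensions of each query/key head are rotated (the qrot/krot views), while the remaining half passes through untouched (qpass/kpass) and is concatenated back on afterwards. A minimal standalone sketch of that idea on a plain float buffer (hypothetical helper, not part of llama.cpp):

    #include <cmath>
    #include <cstddef>

    // Rotate the first n_rot dims of one head in-place; leave the rest as-is.
    // Pairs (x[2k], x[2k+1]) are rotated by pos * freq_base^(-2k/n_rot),
    // mirroring the frequencies ggml_rope_custom would use on the qrot view.
    static void partial_rope(float * head, size_t head_dim, size_t n_rot,
                             int pos, float freq_base) {
        for (size_t i = 0; i + 1 < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -(float) i / n_rot);
            const float c  = std::cos(theta);
            const float s  = std::sin(theta);
            const float x0 = head[i];
            const float x1 = head[i + 1];
            head[i]     = x0 * c - x1 * s;
            head[i + 1] = x0 * s + x1 * c;
        }
        (void) head_dim; // dims [n_rot, head_dim) intentionally pass through
    }
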
+static struct ggml_cgraph * llm_build_bloom(
+         llama_context & lctx,
+    const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, token, norm_eps);
+        inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
+    }
+
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            // store key and value to memory
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
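
Note on ggml_alibi in llm_build_bloom above: BLOOM uses no rotary embedding; instead ggml_alibi(ctx0, KQ_scaled, kv_head, n_head, 8) adds a head-specific linear bias to the attention scores, penalizing each query/key pair in proportion to their distance. A simplified sketch of the per-head slope (valid when n_head is a power of two; the full formulation interpolates extra slopes otherwise):

    #include <cmath>

    // ALiBi slope for one head; max_bias = 8 matches the value passed above.
    // For n_head = 32 this yields 2^(-0.25 * (head + 1)), as in the ALiBi paper;
    // the score of query i attending to key j is biased by -slope * (i - j).
    static float alibi_slope(int head, int n_head, float max_bias) {
        const int   n_pow2 = 1 << (int) std::floor(std::log2((double) n_head));
        const float m0     = std::pow(2.0f, -max_bias / (float) n_pow2);
        return std::pow(m0, (float) (head + 1));
    }
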
+static struct ggml_cgraph * llm_build_mpt(
+         llama_context & lctx,
+    const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    const float norm_eps       = hparams.f_norm_eps;
+    const float clamp_kqv      = hparams.f_clamp_kqv;
+    const float max_alibi_bias = hparams.f_max_alibi_bias;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    //int warmup = 0;
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+            //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
+        }
+
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }

-
-
-    struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    ggml_allocr_alloc(lctx.alloc, inp_positions);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        for (int i = 0; i < n_tokens; ++i) {
-            ((int32_t *) inp_positions->data)[i] = batch.pos[i];
-        }
-    }
-    ggml_set_name(inp_positions, "inp_positions");
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;

-
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
     }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS

     // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
@@ -4227,6 +5555,7 @@ static struct ggml_cgraph * llm_build_starcoder(

     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
     ggml_set_name(KQ_mask, "KQ_mask");
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
@@ -4236,7 +5565,7 @@ static struct ggml_cgraph * llm_build_starcoder(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4247,48 +5576,87 @@ static struct ggml_cgraph * llm_build_starcoder(
         }
     }

-    inpL = ggml_add(ctx0, token, position);
-    ggml_set_name(inpL, "inpL");
-
     for (int il = 0; il < n_layer; ++il) {
-        {
-            // Norm
-            cur = ggml_norm(ctx0, inpL, norm_eps);
-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        struct ggml_tensor * attn_norm;
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
         }
+#endif // GGML_USE_CUBLAS

+        // self-attention
+        // TODO: refactor into common function (shared with LLaMA)
         {
-            // Self Attention
-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);

-            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+            attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+            offload_func(attn_norm);

-            struct ggml_tensor * Qcur = tmpq;
-            struct ggml_tensor * Kcur = tmpk;
+            if (1) {
+                cur = attn_norm;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            if (clamp_kqv > 0.0f) {
+                cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                offload_func_kq(cur);
+            }
+
+            const size_t wsize = ggml_type_size(cur->type);
+
+            struct ggml_tensor * Qcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                0);
+            offload_func_kq(Qcur);
+
+            struct ggml_tensor * Kcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * n_head);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_kq(Kcur);
+
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");

             {
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
                 ggml_set_name(Vcur, "Vcur");

                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");

                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }

-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
-                        0, 2, 1, 3);
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");

             struct ggml_tensor * K =
@@ -4297,85 +5665,105 @@ static struct ggml_cgraph * llm_build_starcoder(
                     ggml_element_size(kv_self.k)*n_embd_gqa,
                     ggml_element_size(kv_self.k)*n_embd_head,
                     ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
             ggml_set_name(K, "K");

-            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");

-            // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");

-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            // TODO: replace with ggml_add()
+            struct ggml_tensor * KQ_scaled_alibi =
+                ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+            offload_func_kq(KQ_scaled_alibi);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");

-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

-            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                         n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
             ggml_set_name(V, "V");

             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");

-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
-        }

-        // Projection
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }

         // Add the input
         cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);

-        struct ggml_tensor * inpFF = cur;
+        struct ggml_tensor * attn_out = cur;

-        // FF
+        // feed forward
         {
             // Norm
             {
-                cur = ggml_norm(ctx0, inpFF, norm_eps);
-                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+                cur = ggml_norm(ctx0, attn_out, norm_eps);
+                offload_func(cur);
+
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
             }

-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);

-            // GELU activation
             cur = ggml_gelu(ctx0, cur);
-
-            // Projection
-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
         }

-        inpL = ggml_add(ctx0, cur, inpFF);
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        // input for next layer
+        inpL = cur;
     }

-    // Output Norm
+    cur = inpL;
+
+    // norm
     {
-        cur = ggml_norm(ctx0, inpL, norm_eps);
-        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        ggml_set_name(cur, "result_norm");
     }
-    ggml_set_name(cur, "result_norm");

     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");

     ggml_build_forward_expand(gf, cur);
+
     ggml_free(ctx0);

     return gf;
@@ -4405,10 +5793,22 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                result = llm_build_persimmon(lctx, batch);
+            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm_build_refact(lctx, batch);
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                result = llm_build_bloom(lctx, batch);
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                result = llm_build_mpt(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -4420,7 +5820,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 //   - lctx:      llama context
 //   - batch:     batch to evaluate
-//   - n_threads: number of threads to use
 //
 // return 0 on success
 // return positive int on warning
@@ -4466,8 +5865,11 @@ static int llama_decode_internal(

     // helpers for smoother batch API transistion
     // after deprecating the llama_eval calls, these will be removed
-    std::vector<llama_pos> pos;
-    std::vector<llama_seq_id> seq_id;
+    std::vector<llama_pos> pos;
+
+    std::vector<int32_t>                   n_seq_id;
+    std::vector<llama_seq_id *>            seq_id_arr;
+    std::vector<std::vector<llama_seq_id>> seq_id;

     if (batch.pos == nullptr) {
         pos.resize(n_tokens);
@@ -4479,18 +5881,20 @@ static int llama_decode_internal(
     }

     if (batch.seq_id == nullptr) {
+        n_seq_id.resize(n_tokens);
         seq_id.resize(n_tokens);
+        seq_id_arr.resize(n_tokens);
         for (uint32_t i = 0; i < n_tokens; i++) {
-            seq_id[i] = batch.all_seq_id;
+            n_seq_id[i] = 1;
+            seq_id[i].resize(1);
+            seq_id[i][0] = batch.all_seq_id;
+            seq_id_arr[i] = seq_id[i].data();
         }

-        batch.seq_id = seq_id.data();
+        batch.n_seq_id = n_seq_id.data();
+        batch.seq_id = seq_id_arr.data();
     }

-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -4509,6 +5913,13 @@ static int llama_decode_internal(

     ggml_allocr_alloc_graph(lctx.alloc, gf);

+    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    GGML_ASSERT(strcmp(res->name,        "result_output") == 0);
+    GGML_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);
+
+
 #ifdef GGML_USE_CUBLAS
     for (int i = 0; i < gf->n_leafs; i++) {
         ggml_tensor * node = gf->leafs[i];
@@ -4526,6 +5937,12 @@ static int llama_decode_internal(
     }

     ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
+    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
+    if (!lctx.embedding.empty()) {
+        embeddings->backend = GGML_BACKEND_CPU;
+    }
+    res->backend = GGML_BACKEND_CPU;
 #endif

     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -4543,18 +5960,13 @@ static int llama_decode_internal(
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
         model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT;
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
     }

-    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -4576,8 +5988,12 @@ static int llama_decode_internal(
 #endif

     // update the kv ring buffer
-    lctx.kv_self.head += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -4903,7 +6319,6 @@ struct llm_tokenizer_bpe {
         llm_symbol sym;
         size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
         sym.text = word.c_str() + offset;
-        sym.n = 1;
         sym.n = char_len;
         offset += sym.n;
         sym.prev = index - 1;
@@ -5040,7 +6455,6 @@ private:
         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
             bool split_condition = false;
-            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
             int bytes_remain = text_utf.size() - i;
             // forward backward lookups
             const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5066,9 +6480,9 @@ private:
                 if (!split_condition && bytes_remain >= 3) {
                     // 're|'ve|'ll
                     if (utf_char == "\'" && (
-                        (utf_char_next == "r"
-                        (utf_char_next == "v"
-                        (utf_char_next == "l"
+                        (utf_char_next == "r" && utf_char_next_next == "e") ||
+                        (utf_char_next == "v" && utf_char_next_next == "e") ||
+                        (utf_char_next == "l" && utf_char_next_next == "l"))
                         ) {
                         split_condition = true;
                     }
@@ -5119,7 +6533,7 @@ private:
                 else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                     split_condition = true;
                 }
-                else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next)
+                else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                     split_condition = true;
                 }
             }
@@ -5164,7 +6578,137 @@ private:
     llm_bigram_bpe::queue work_queue;
 };

-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
+    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
+} FRAGMENT_BUFFER_VARIANT_TYPE;
+
+struct fragment_buffer_variant{
+    fragment_buffer_variant(llama_vocab::id _token)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
+        token(_token),
+        raw_text(_dummy),
+        offset(0),
+        length(0){}
+    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+        token((llama_vocab::id)-1),
+        raw_text(_raw_text),
+        offset(_offset),
+        length(_length){
+            GGML_ASSERT( _offset >= 0 );
+            GGML_ASSERT( _length >= 1 );
+            GGML_ASSERT( offset + length <= raw_text.length() );
+        }
+
+    const FRAGMENT_BUFFER_VARIANT_TYPE type;
+    const llama_vocab::id token;
+    const std::string _dummy;
+    const std::string & raw_text;
+    const uint64_t offset;
+    const uint64_t length;
+};
+
+// #define PRETOKENIZERDEBUG
+
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
+{
+    // for each special token
+    for (const auto & st: vocab.special_tokens_cache) {
+        const auto & special_token = st.first;
+        const auto & special_id    = st.second;
+
+        // for each text fragment
+        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        while (it != buffer.end()) {
+            auto & fragment = (*it);
+
+            // if a fragment is text ( not yet processed )
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                auto * raw_text = &(fragment.raw_text);
+
+                auto raw_text_base_offset = fragment.offset;
+                auto raw_text_base_length = fragment.length;
+
+                // loop over the text
+                while (true) {
+                    // find the first occurence of a given special token in this fragment
+                    // passing offset argument only limit the "search area" but match coordinates
+                    // are still relative to the source full raw_text
+                    auto match = raw_text->find(special_token, raw_text_base_offset);
+
+                    // no occurences found, stop processing this fragment for a given special token
+                    if (match == std::string::npos) break;
+
+                    // check if match is within bounds of offset <-> length
+                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
+
+#ifdef PRETOKENIZERDEBUG
+                    fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    auto source = std::distance(buffer.begin(), it);
+
+                    // if match is further than base offset
+                    // then we have some text to the left of it
+                    if (match > raw_text_base_offset) {
+                        // left
+                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
+                        const int64_t left_reminder_length = match - raw_text_base_offset;
+                        buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+                        it++;
+                    }
+
+                    // special token
+                    buffer.emplace_after(it, special_id);
+                    it++;
+
+                    // right
+                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
+                        const int64_t right_reminder_offset = match + special_token.length();
+                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+                        buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
+                        it++;
+
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+
+                        // repeat for the right side
+                        raw_text_base_offset = right_reminder_offset;
+                        raw_text_base_length = right_reminder_length;
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    } else {
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+                        break;
+                    }
+                }
+            }
+            it++;
+        }
+    }
+}
+
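
Note on tokenizer_st_partition above: it walks the fragment list and splits every raw-text fragment around exact occurrences of each special token, so a literal "<s>" in the input becomes its single token id instead of being split by SPM/BPE. A simplified standalone analogue for one special token (sketch only; the real code keeps offset/length fragments in a std::forward_list and never copies the text):

    #include <string>
    #include <vector>

    // Split `text` around every exact occurrence of the special token `st`;
    // the returned `st` entries stand in for emitting its token id directly.
    static std::vector<std::string> split_on_special(const std::string & text,
                                                     const std::string & st) {
        std::vector<std::string> parts;
        size_t pos = 0;
        while (true) {
            const size_t match = text.find(st, pos);
            if (match == std::string::npos) break;
            if (match > pos) parts.push_back(text.substr(pos, match - pos));
            parts.push_back(st);
            pos = match + st.length();
        }
        if (pos < text.length()) parts.push_back(text.substr(pos));
        return parts;
    }
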
6711
|
+
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
|
5168
6712
|
std::vector<llama_vocab::id> output;
|
5169
6713
|
|
5170
6714
|
// OG tokenizer behavior:
|
@@ -5180,20 +6724,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
5180
6724
|
return output;
|
5181
6725
|
}
|
5182
6726
|
|
6727
|
+
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
6728
|
+
fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
|
6729
|
+
|
6730
|
+
if (special) tokenizer_st_partition( vocab, fragment_buffer );
|
6731
|
+
|
5183
6732
|
switch (vocab.type) {
|
5184
6733
|
case LLAMA_VOCAB_TYPE_SPM:
|
5185
6734
|
{
|
5186
|
-
|
5187
|
-
|
6735
|
+
for (const auto & fragment: fragment_buffer)
|
6736
|
+
{
|
6737
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
6738
|
+
{
|
6739
|
+
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
5188
6740
|
|
5189
|
-
|
5190
|
-
|
5191
|
-
|
6741
|
+
// TODO: It's likely possible to get rid of this string copy entirely
|
6742
|
+
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
6743
|
+
// and passing 'add space prefix' as bool argument
|
6744
|
+
//
|
6745
|
+
auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
|
6746
|
+
|
6747
|
+
#ifdef PRETOKENIZERDEBUG
|
6748
|
+
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
6749
|
+
#endif
|
6750
|
+
llm_tokenizer_spm tokenizer(vocab);
|
6751
|
+
llama_escape_whitespace(raw_text);
|
6752
|
+
tokenizer.tokenize(raw_text, output);
|
6753
|
+
}
|
6754
|
+
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
6755
|
+
{
|
6756
|
+
output.push_back(fragment.token);
|
6757
|
+
}
|
6758
|
+
}
|
5192
6759
|
} break;
|
5193
6760
|
case LLAMA_VOCAB_TYPE_BPE:
|
5194
6761
|
{
|
5195
|
-
|
5196
|
-
|
6762
|
+
for (const auto & fragment: fragment_buffer)
|
6763
|
+
{
|
6764
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
6765
|
+
{
|
6766
|
+
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
6767
|
+
|
6768
|
+
#ifdef PRETOKENIZERDEBUG
|
6769
|
+
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
6770
|
+
#endif
|
6771
|
+
llm_tokenizer_bpe tokenizer(vocab);
|
6772
|
+
tokenizer.tokenize(raw_text, output);
|
6773
|
+
}
|
6774
|
+
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
6775
|
+
{
|
6776
|
+
output.push_back(fragment.token);
|
6777
|
+
}
|
6778
|
+
}
|
5197
6779
|
} break;
|
5198
6780
|
}
|
5199
6781
|
|
@@ -5466,7 +7048,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
5466
7048
|
std::vector<llama_grammar_candidate> rejects;
|
5467
7049
|
|
5468
7050
|
if (stack.empty()) {
|
5469
|
-
for (auto tok : candidates) {
|
7051
|
+
for (const auto & tok : candidates) {
|
5470
7052
|
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
|
5471
7053
|
rejects.push_back(tok);
|
5472
7054
|
}
|
@@ -5477,7 +7059,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
5477
7059
|
const llama_grammar_element * stack_pos = stack.back();
|
5478
7060
|
|
5479
7061
|
std::vector<llama_grammar_candidate> next_candidates;
|
5480
|
-
for (auto tok : candidates) {
|
7062
|
+
for (const auto & tok : candidates) {
|
5481
7063
|
if (*tok.code_points == 0) {
|
5482
7064
|
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
5483
7065
|
// that cannot satisfy this position in grammar
|
@@ -5503,7 +7085,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
5503
7085
|
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
5504
7086
|
|
5505
7087
|
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
5506
|
-
for (auto tok : next_rejects) {
|
7088
|
+
for (const auto & tok : next_rejects) {
|
5507
7089
|
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
5508
7090
|
}
|
5509
7091
|
|
@@ -6635,7 +8217,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
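Context for the widened test: architectures with a fused QKV projection store a single attn_qkv.weight tensor instead of a separate attn_v.weight, so both spellings must count toward n_attention_wv. A minimal sketch of the predicate:

// Sketch of the widened check; both names mark attention tensors that feed
// the quantization heuristics.
#include <string>

bool counts_as_attention_weight(const std::string & name) {
    return name.find("attn_v.weight")   != std::string::npos ||
           name.find("attn_qkv.weight") != std::string::npos;
}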
@@ -6672,6 +8254,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
     const size_t meta_size = gguf_get_meta_size(ctx_out);
 
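The new exception mask changes failure handling for the quantized-output stream: once failbit is in the mask, a failed write raises std::ios_base::failure instead of silently setting a state flag. A standalone demonstration of that standard-library behavior (the path here is hypothetical):

// Standalone demonstration; the output path is hypothetical.
#include <fstream>
#include <iostream>

int main() {
    try {
        std::ofstream fout("/nonexistent-dir/out.bin", std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // throws at once if the open
                                                 // failed, and on any later
                                                 // failed write
        fout << "data";
    } catch (const std::ios_base::failure & e) {
        std::cerr << "write failed: " << e.what() << '\n';
        return 1;
    }
    return 0;
}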
@@ -7535,6 +9118,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam
 }
 
 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst) {
+        return;
+    }
     llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
 }
 
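The added guard makes copying a sequence onto itself a no-op. A hypothetical usage sketch, assuming a loaded context, that fans one evaluated prompt out to several sequences:

#include "llama.h"

// Hypothetical helper: after a shared prompt was evaluated once on sequence 0,
// copy its KV entries to the other sequences instead of re-evaluating it.
static void share_prompt_kv(struct llama_context * ctx, int n_parallel, llama_pos n_prompt) {
    for (llama_seq_id s = 0; s < n_parallel; ++s) {
        // s == 0 copies onto itself -- now a harmless no-op
        llama_kv_cache_seq_cp(ctx, 0, s, 0, n_prompt);
    }
}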
@@ -7987,7 +9573,7 @@ int llama_eval_embd(
         int n_past) {
     llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
 
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
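The extra nullptr exists because llama_batch gained an n_seq_id member between pos and seq_id, and this positional initializer must track the field order exactly. An illustrative mirror of the order implied by this diff (llama.h holds the authoritative definition; this struct is hypothetical):

#include "llama.h" // llama_token / llama_pos / llama_seq_id typedefs

// Hypothetical mirror, not the real declaration -- see llama.h.
struct llama_batch_like {
    int32_t         n_tokens;
    llama_token   * token;
    float         * embd;
    llama_pos     * pos;
    int32_t       * n_seq_id;  // new: sequence count per token
    llama_seq_id ** seq_id;    // new shape: one id array per token
    int8_t        * logits;
    llama_pos       all_pos_0;
    llama_pos       all_pos_1;
    llama_seq_id    all_seq_id;
};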
@@ -8008,20 +9594,21 @@ struct llama_batch llama_batch_get_one(
         llama_pos pos_0,
         llama_seq_id seq_id) {
     return {
-        /*n_tokens   =*/ n_tokens,
-        /*tokens     =*/ tokens,
-        /*embd       =*/ nullptr,
-        /*pos        =*/ nullptr,
-        /*seq_id     =*/ nullptr,
-        /*logits     =*/ nullptr,
-        /*all_pos_0  =*/ pos_0,
-        /*all_pos_1  =*/ 1,
-        /*all_seq_id =*/ seq_id,
+        /*n_tokens   =*/ n_tokens,
+        /*tokens     =*/ tokens,
+        /*embd       =*/ nullptr,
+        /*pos        =*/ nullptr,
+        /*n_seq_id   =*/ nullptr,
+        /*seq_id     =*/ nullptr,
+        /*logits     =*/ nullptr,
+        /*all_pos_0  =*/ pos_0,
+        /*all_pos_1  =*/ 1,
+        /*all_seq_id =*/ seq_id,
     };
 }
 
-struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
 
     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
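Callers of the batch API now pass the maximum number of sequences per token up front. A usage sketch under that assumption:

#include "llama.h"

// A token batch (embd == 0) where every token belongs to at most one sequence.
static void batch_roundtrip(void) {
    llama_batch batch = llama_batch_init(/*n_tokens=*/512, /*embd=*/0, /*n_seq_max=*/1);
    // ... fill batch.token / batch.pos / batch.n_seq_id / batch.seq_id ...
    llama_batch_free(batch); // frees the per-token seq_id arrays as well
}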
@@ -8029,19 +9616,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }
 
-    batch.pos    = (llama_pos *)    malloc(sizeof(llama_pos)    * n_tokens);
-    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
-    batch.logits = (int8_t *)       malloc(sizeof(int8_t)       * n_tokens);
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
+    for (int i = 0; i < n_tokens; ++i) {
+        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+    }
+    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens);
 
     return batch;
 }
 
 void llama_batch_free(struct llama_batch batch) {
-    if (batch.token)  free(batch.token);
-    if (batch.embd)   free(batch.embd);
-    if (batch.pos)    free(batch.pos);
-    if (batch.seq_id) free(batch.seq_id);
-    if (batch.logits) free(batch.logits);
+    if (batch.token)    free(batch.token);
+    if (batch.embd)     free(batch.embd);
+    if (batch.pos)      free(batch.pos);
+    if (batch.n_seq_id) free(batch.n_seq_id);
+    if (batch.seq_id) {
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            free(batch.seq_id[i]);
+        }
+        free(batch.seq_id);
+    }
+    if (batch.logits)   free(batch.logits);
 }
 
 int llama_decode(
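seq_id is now an array of per-token id arrays, so each token records how many sequences it belongs to. A hypothetical helper showing how a caller might fill the fields allocated above:

#include "llama.h"

// Hypothetical helper; assumes 'batch' came from llama_batch_init with
// n_seq_max >= 1 and still has spare capacity.
static void batch_push(llama_batch & batch, llama_token tok, llama_pos pos, llama_seq_id seq) {
    const int i = batch.n_tokens;
    batch.token[i]     = tok;
    batch.pos[i]       = pos;
    batch.n_seq_id[i]  = 1;   // this token belongs to exactly one sequence
    batch.seq_id[i][0] = seq; // per-token array sized n_seq_max at init
    batch.logits[i]    = 0;   // no logits requested for this position
    batch.n_tokens++;
}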
@@ -8106,15 +9703,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) {
     return ctx->model.vocab.special_eot_id;
 }
 
-
 int llama_tokenize(
     const struct llama_model * model,
                   const char * text,
                          int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
-                        bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
+                        bool   add_bos,
+                        bool   special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
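A usage sketch for the new special flag, assuming a loaded model: when the flag is true, special tokens embedded in the text are parsed rather than tokenized as plain text.

#include "llama.h"

#include <string>
#include <vector>

// Assumes 'model' is already loaded; buffer sizing (one token per byte plus
// optional BOS) is a conservative guess, not an API guarantee.
static std::vector<llama_token> tokenize(const struct llama_model * model,
                                         const std::string & text,
                                         bool add_bos, bool special) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                 tokens.data(), (int) tokens.size(),
                                 add_bos, special);
    tokens.resize(n < 0 ? 0 : n); // negative n: buffer was too small
    return tokens;
}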
@@ -8166,7 +9763,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
             } else {
-                GGML_ASSERT(false);
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
             }
             break;
         }
@@ -8182,7 +9781,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
         } else if (llama_is_control_token(model->vocab, token)) {
             ;
         } else {
-            GGML_ASSERT(false);
+            // TODO: for now we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
+            // GGML_ASSERT(false);
         }
         break;
     }
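Both vocab branches now skip unsupported token types instead of asserting, so callers simply receive an empty piece. A usage sketch of the decode side, assuming a loaded model:

#include "llama.h"

#include <string>

// Assumes 'model' is already loaded; a negative return is taken to signal the
// buffer was too small (mirroring the return -1 paths in this function).
static std::string token_to_piece(const struct llama_model * model, llama_token token) {
    char buf[64];
    const int n = llama_token_to_piece(model, token, buf, (int) sizeof(buf));
    if (n < 0) {
        return std::string();
    }
    return std::string(buf, n); // unsupported token types now yield ""
}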