llama_cpp 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +354 -126
- data/ext/llama_cpp/src/ggml-metal.metal +128 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +17 -15
- data/ext/llama_cpp/src/ggml.c +58 -46
- data/ext/llama_cpp/src/ggml.h +12 -7
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1360 -60
- data/lib/llama_cpp/version.rb +2 -2
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -186,7 +186,9 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -199,7 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT, "mpt" },
     { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
-    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BLOOM, "bloom" },
 };
 
 enum llm_kv {
@@ -302,6 +306,7 @@ struct LLM_KV {
 
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -318,6 +323,8 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
 };
 
 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -399,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PERSIMMON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
+            { LLM_TENSOR_OUTPUT, "output"},
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+        },
+    },
     {
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
     {
@@ -437,6 +469,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -954,6 +1001,7 @@ enum e_model {
     MODEL_1B,
     MODEL_3B,
     MODEL_7B,
+    MODEL_8B,
     MODEL_13B,
     MODEL_15B,
     MODEL_30B,
@@ -984,6 +1032,9 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
 
+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
@@ -1036,6 +1087,10 @@ struct llama_layer {
     struct ggml_tensor * attn_norm_b;
     struct ggml_tensor * attn_norm_2;
     struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;
 
     // attention
     struct ggml_tensor * wq;
@@ -1077,6 +1132,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;
 
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
 
@@ -1162,6 +1220,8 @@ struct llama_model {
 
     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_norm;
+    struct ggml_tensor * tok_norm_b;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1291,7 +1351,11 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
+    // TODO: this should be:
+    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    // change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1334,6 +1398,8 @@ static bool llama_kv_cache_init(
 
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
@@ -1349,8 +1415,8 @@ static bool llama_kv_cache_find_slot(
 
     while (true) {
         if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
             cache.head = 0;
-            n_tested += n_ctx - cache.head;
             continue;
         }
 
@@ -1401,6 +1467,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }
 
 static void llama_kv_cache_seq_rm(
@@ -1408,6 +1477,8 @@ static void llama_kv_cache_seq_rm(
         llama_seq_id seq_id,
         llama_pos p0,
         llama_pos p1) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
@@ -1416,9 +1487,13 @@ static void llama_kv_cache_seq_rm(
             cache.cells[i].seq_id.erase(seq_id);
             if (cache.cells[i].seq_id.empty()) {
                 cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_cp(
@@ -1430,6 +1505,8 @@ static void llama_kv_cache_seq_cp(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+    cache.head = 0;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1438,12 +1515,18 @@ static void llama_kv_cache_seq_cp(
 }
 
 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_shift(
@@ -1452,6 +1535,8 @@ static void llama_kv_cache_seq_shift(
         llama_pos p0,
         llama_pos p1,
         llama_pos delta) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
@@ -1461,12 +1546,17 @@ static void llama_kv_cache_seq_shift(
             if (cache.cells[i].pos < 0) {
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) new_head = i;
             } else {
                 cache.has_shift = true;
                 cache.cells[i].delta = delta;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
 }
 
 //
@@ -1670,7 +1760,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ctx, true);
         }
@@ -1688,7 +1778,7 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
         if (cur == NULL) {
@@ -1867,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B: return "1B";
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
+        case MODEL_8B: return "8B";
         case MODEL_13B: return "13B";
         case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
@@ -1979,6 +2070,14 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_REFACT:
             {
                 GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1987,6 +2086,33 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_BLOOM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                        } break;
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                hparams.f_clamp_kqv = 0.0f;
+
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_30B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -2131,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2230,8 +2358,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2266,8 +2394,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2296,8 +2424,8 @@ static void llm_load_tensors(
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2332,8 +2460,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2366,8 +2494,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2404,8 +2532,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2443,8 +2571,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2481,8 +2609,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2515,6 +2643,216 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+                const int i_gpu_start = n_layer - n_gpu_layers;
+                model.layers.resize(n_layer);
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    auto & layer = model.layers[i];
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+                    layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
+                    layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                // TODO: CPU-only for now
+
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
+                model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+                            ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
+                            ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
+                    }
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) +
+                            ggml_nbytes(layer.wqkv) +
+                            ggml_nbytes(layer.wo) +
+                            ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w2) +
+                            ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -2624,8 +2962,8 @@ static bool llama_model_load(
     }
 
 static struct ggml_cgraph * llm_build_llama(
-
-
+         llama_context & lctx,
+     const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -2663,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/ false,
+        /*.no_alloc =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3051,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/ false,
+        /*.no_alloc =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3452,11 +3786,9 @@ static struct ggml_cgraph * llm_build_refact(
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/ false,
+        /*.no_alloc =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3806,11 +4138,9 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/ false,
+        /*.no_alloc =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4166,11 +4496,9 @@ static struct ggml_cgraph * llm_build_starcoder(
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/ false,
+        /*.no_alloc =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4381,19 +4709,975 @@ static struct ggml_cgraph * llm_build_starcoder(
     return gf;
 }
 
-static struct ggml_cgraph *
+static struct ggml_cgraph * llm_build_persimmon(
         llama_context & lctx,
     const llama_batch & batch) {
     const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
 
-
+    const auto & kv_self = lctx.kv_self;
 
-
-
-
-
-
-
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const auto & cparams = lctx.cparams;
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = cparams.n_ctx;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+    const size_t n_rot = n_embd_head / 2;
+
+    const float freq_base = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    auto & buf_compute = lctx.buf_compute;
+    struct ggml_init_params params = {
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc =*/ true,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v = llama_nop;
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos pos = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                // we rotate only the first n_rot dimensions.
+                ggml_rope_custom_inplace(ctx0,
+                    ggml_view_3d(ctx0, kv_self.k,
+                        n_rot, n_head, n_ctx,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
+                    ),
+                    K_shift, n_rot, 2, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }
+    for (int il=0; il < n_layer; ++il) {
+        struct ggml_tensor * residual = inpL;
+        offload_func_t offload_func = llama_nop;
+        {
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(cur);
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
+            offload_func(cur);
+            ggml_format_name(cur, "input_layernorm_%d", il);
+        }
+        // self attention
+        {
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+            offload_func_kq(cur);
+
+            // split qkv
+            GGML_ASSERT(n_head_kv == n_head);
+            ggml_set_name(cur, format("qkv_%d", il).c_str());
+            struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
+            offload_func_kq(tmpqkv);
+            struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
+            offload_func_kq(tmpqkv_perm);
+            ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
+            struct ggml_tensor * tmpq = ggml_view_3d(
+                ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                ggml_element_size(tmpqkv_perm) * n_embd_head,
+                ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                0
+            );
+            offload_func_kq(tmpq);
+            struct ggml_tensor * tmpk = ggml_view_3d(
+                ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                ggml_element_size(tmpqkv_perm) * n_embd_head,
+                ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
+            );
+            offload_func_kq(tmpk);
+            // Q/K Layernorm
+            tmpq = ggml_norm(ctx0, tmpq, norm_eps);
+            offload_func_kq(tmpq);
+            tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
+            offload_func_kq(tmpq);
+            tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
+            offload_func_kq(tmpq);
+
+            tmpk = ggml_norm(ctx0, tmpk, norm_eps);
+            offload_func_v(tmpk);
+            tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
+            offload_func_v(tmpk);
+            tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
+            offload_func_v(tmpk);
+
+            // RoPE the first n_rot of q/k, pass the other half, and concat.
+            struct ggml_tensor * qrot = ggml_view_3d(
+                ctx0, tmpq, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpq) * n_embd_head,
+                ggml_element_size(tmpq) * n_embd_head * n_head,
+                0
+            );
+            offload_func_kq(qrot);
+            ggml_format_name(qrot, "qrot_%d", il);
+            struct ggml_tensor * krot = ggml_view_3d(
+                ctx0, tmpk, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpk) * n_embd_head,
+                ggml_element_size(tmpk) * n_embd_head * n_head,
+                0
+            );
+            offload_func_kq(krot);
+            ggml_format_name(krot, "krot_%d", il);
+
+            // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+            struct ggml_tensor * qpass = ggml_view_3d(
+                ctx0, tmpq, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpq) * n_embd_head,
+                ggml_element_size(tmpq) * n_embd_head * n_head,
+                ggml_element_size(tmpq) * n_rot
+            );
+            offload_func_kq(qpass);
+            ggml_format_name(qpass, "qpass_%d", il);
+            struct ggml_tensor * kpass = ggml_view_3d(
+                ctx0, tmpk, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpk) * n_embd_head,
+                ggml_element_size(tmpk) * n_embd_head * n_head,
+                ggml_element_size(tmpk) * n_rot
+            );
+            offload_func_kq(kpass);
+            ggml_format_name(kpass, "kpass_%d", il);
+
+            struct ggml_tensor * qrotated = ggml_rope_custom(
+                ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+            );
+            offload_func_kq(qrotated);
+            struct ggml_tensor * krotated = ggml_rope_custom(
+                ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+            );
+            offload_func_kq(krotated);
+            // ggml currently only supports concatenation on dim=2
+            // so we need to permute qrot, qpass, concat, then permute back.
+            qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+            offload_func_kq(qrotated);
+            krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+            offload_func_kq(krotated);
+
+            qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+            offload_func_kq(qpass);
+            kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+            offload_func_kq(kpass);
+
+            struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+            offload_func_kq(Qcur);
+            struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+            offload_func_kq(Q);
+
+            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+            offload_func_kq(Kcur);
+            {
+                struct ggml_tensor * tmpv = ggml_view_3d(
+                    ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
+                );
+                offload_func_v(tmpv);
+                // store K, V in cache
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(
+                    ctx0, kv_self.k, n_tokens*n_embd_gqa,
+                    (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
+                );
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                    ( n_ctx)*ggml_element_size(kv_self.v),
+                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+            struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
+                n_embd_head, n_kv, n_head_kv,
+                ggml_element_size(kv_self.k)*n_embd_gqa,
+                ggml_element_size(kv_self.k)*n_embd_head,
+                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+
+            offload_func_kq(K);
+            ggml_format_name(K, "K_%d", il);
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_kq(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                    n_kv, n_embd_head, n_head_kv,
+                    ggml_element_size(kv_self.v)*n_ctx,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].bo);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+        {
+            // MLP
+            {
+                // Norm
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                offload_func(cur);
+                cur = ggml_add(ctx0,
+                    ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
+                    model.layers[il].ffn_norm_b
+                );
+                ggml_set_name(cur, "ffn_norm");
+                offload_func(cur);
+            }
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);
+
+            cur = ggml_add(ctx0, cur, model.layers[il].b3);
+            offload_func(cur);
+            ggml_set_name(cur, "result_ffn_up");
+
+            cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
+            ggml_set_name(cur, "result_ffn_act");
+            offload_func(cur);
+            offload_func(cur->src[0]);
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+            cur = ggml_add(ctx0,
+                cur,
+                model.layers[il].b2);
+            offload_func(cur);
+            ggml_set_name(cur, "outFF");
+        }
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_outFF");
+        inpL = cur;
+    }
+    cur = inpL;
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        offload_func_nr(cur);
+
+        cur = ggml_add(ctx0, cur, model.output_norm_b);
+        // offload_func_nr(cur);
+
+        ggml_set_name(cur, "result_norm");
+    }
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+    return gf;
+}
+
+static struct ggml_cgraph * llm_build_bloom(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = cparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos pos = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, token, norm_eps);
+        inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
+    }
+
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            // store key and value to memory
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                    ( n_ctx)*ggml_element_size(kv_self.v),
+                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                    ggml_cpy(ctx0,
+                        Qcur,
+                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                    0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                    n_embd_head, n_kv, n_head_kv,
+                    ggml_element_size(kv_self.k)*n_embd_gqa,
+                    ggml_element_size(kv_self.k)*n_embd_head,
+                    ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                    n_kv, n_embd_head, n_head_kv,
+                    ggml_element_size(kv_self.v)*n_ctx,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
|
5333
|
+
|
5334
|
+
// Output Norm
|
5335
|
+
{
|
5336
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5337
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
5338
|
+
}
|
5339
|
+
ggml_set_name(cur, "result_norm");
|
5340
|
+
|
5341
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5342
|
+
ggml_set_name(cur, "result_output");
|
5343
|
+
|
5344
|
+
ggml_build_forward_expand(gf, cur);
|
5345
|
+
|
5346
|
+
ggml_free(ctx0);
|
5347
|
+
|
5348
|
+
return gf;
|
5349
|
+
}
|
5350
|
+
|
5351
|
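The KQ_mask filled in the function above is the per-batch causal attention mask: entry (j, i) stays 0 when token j may attend to KV cache cell i, and becomes -INFINITY when the cell belongs to a different sequence or to a position in that token's future. A minimal standalone sketch of the same rule, using simplified stand-in types rather than the llama.cpp KV cell struct (kv_cell, build_kq_mask and the single-seq-id-per-cell assumption are illustrative only):

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for one KV cache cell: which sequence wrote it and at what position.
    struct kv_cell { int32_t seq_id; int32_t pos; };

    // Fill an [n_tokens x n_kv] additive attention mask the same way the loop above does:
    // 0.0f where token j may attend to cache cell i, -INFINITY otherwise
    // (wrong sequence, or a cell whose position lies in token j's future).
    static std::vector<float> build_kq_mask(const std::vector<kv_cell> & cells,
                                            const std::vector<int32_t> & pos,
                                            const std::vector<int32_t> & seq_id) {
        const size_t n_kv     = cells.size();
        const size_t n_tokens = pos.size();
        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (size_t j = 0; j < n_tokens; ++j) {
            for (size_t i = 0; i < n_kv; ++i) {
                if (cells[i].seq_id != seq_id[j] || cells[i].pos > pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }
        return mask; // added to the scaled KQ logits before the softmax
    }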
+static struct ggml_cgraph * llm_build_mpt(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = cparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    const float norm_eps = hparams.f_norm_eps;
+    const float clamp_kqv = hparams.f_clamp_kqv;
+    const float max_alibi_bias = hparams.f_max_alibi_bias;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    //int warmup = 0;
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+            //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
+        }
+
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos pos = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * attn_norm;
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        // self-attention
+        // TODO: refactor into common function (shared with LLaMA)
+        {
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);
+
+            attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+            offload_func(attn_norm);
+
+            if (1) {
+                cur = attn_norm;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            if (clamp_kqv > 0.0f) {
+                cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                offload_func_kq(cur);
+            }
+
+            const size_t wsize = ggml_type_size(cur->type);
+
+            struct ggml_tensor * Qcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                0);
+            offload_func_kq(Qcur);
+
+            struct ggml_tensor * Kcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * n_head);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_kq(Kcur);
+
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        ( n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // TODO: replace with ggml_add()
+            struct ggml_tensor * KQ_scaled_alibi =
+                ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+            offload_func_kq(KQ_scaled_alibi);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
+
+        struct ggml_tensor * attn_out = cur;
+
+        // feed forward
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, attn_out, norm_eps);
+                offload_func(cur);
+
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+            }
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);
+
+            cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+        }
+
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        ggml_set_name(cur, "result_norm");
+    }
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
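llm_build_mpt above never materialises separate Q/K/V tensors: Qcur, Kcur and tmpv are strided ggml_view_3d() views into the single fused wqkv output, distinguished only by their offsets and a shared per-token stride of n_embd_head * (n_head + 2 * n_head_kv) elements. A hedged sketch of that offset arithmetic in element units (the real views multiply by wsize, the element size of cur; fused_qkv_offsets is a hypothetical helper and the MPT-7B numbers in main() are only an example):

    #include <cstddef>
    #include <cstdio>

    // Offsets (in elements) of the Q, K and V blocks inside one row of the fused
    // wqkv output, matching the ggml_view_3d() calls in llm_build_mpt above.
    // The assumed row layout is [ Q (n_head heads) | K (n_head_kv heads) | V (n_head_kv heads) ].
    struct qkv_offsets { size_t q, k, v, row; };

    static qkv_offsets fused_qkv_offsets(size_t n_embd_head, size_t n_head, size_t n_head_kv) {
        qkv_offsets o;
        o.q   = 0;
        o.k   = n_embd_head *  n_head;                  // K starts right after the Q heads
        o.v   = n_embd_head * (n_head + n_head_kv);     // V starts after Q and K
        o.row = n_embd_head * (n_head + 2 * n_head_kv); // stride between two tokens
        return o;
    }

    int main() {
        // e.g. MPT-7B: 32 heads of dimension 128, no MQA/GQA
        const qkv_offsets o = fused_qkv_offsets(128, 32, 32);
        std::printf("q=%zu k=%zu v=%zu row=%zu\n", o.q, o.k, o.v, o.row);
        return 0;
    }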
+static struct ggml_cgraph * llama_build_graph(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+
+    struct ggml_cgraph * result = NULL;
+
+    switch (model.arch) {
+        case LLM_ARCH_LLAMA:
+            {
+                result = llm_build_llama(lctx, batch);
+            } break;
+        case LLM_ARCH_BAICHUAN:
             {
                 result = llm_build_baichaun(lctx, batch);
             } break;
@@ -4405,10 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                result = llm_build_persimmon(lctx, batch);
+            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm_build_refact(lctx, batch);
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                result = llm_build_bloom(lctx, batch);
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                result = llm_build_mpt(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
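llama_build_graph() is a plain dispatch on model.arch: each architecture added in this release (persimmon, bloom, mpt) only has to provide its own llm_build_* function and one extra case. A rough sketch of the same pattern with hypothetical stand-in types (arch, graph, context, batch and the build_* functions below are not the llama.cpp definitions):

    #include <functional>
    #include <map>
    #include <stdexcept>

    // Stand-ins for the real types; the point is only the dispatch-by-architecture pattern.
    enum class arch { llama, baichuan, falcon, starcoder, persimmon, refact, bloom, mpt };
    struct graph {};   // placeholder for ggml_cgraph
    struct context {}; // placeholder for llama_context
    struct batch {};   // placeholder for llama_batch

    using builder = std::function<graph(context &, const batch &)>;

    static graph build_bloom(context &, const batch &) { return {}; }
    static graph build_mpt  (context &, const batch &) { return {}; }

    static graph build_graph(arch a, context & ctx, const batch & b) {
        static const std::map<arch, builder> builders = {
            { arch::bloom, build_bloom },
            { arch::mpt,   build_mpt   },
            // ... one entry per supported architecture
        };
        const auto it = builders.find(a);
        if (it == builders.end()) {
            throw std::runtime_error("unknown architecture"); // mirrors the GGML_ASSERT(false) default
        }
        return it->second(ctx, b);
    }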
@@ -4420,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 // - lctx: llama context
 // - batch: batch to evaluate
-// - n_threads: number of threads to use
 //
 // return 0 on success
 // return positive int on warning
@@ -4487,10 +5782,6 @@ static int llama_decode_internal(
         batch.seq_id = seq_id.data();
     }
 
-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -4543,7 +5834,8 @@ static int llama_decode_internal(
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
         model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT;
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -4576,8 +5868,12 @@ static int llama_decode_internal(
 #endif
 
     // update the kv ring buffer
-    lctx.kv_self.head += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
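Together with the earlier hunk that drops the unconditional kv_self.head = 0, the change above turns the KV cache head into a proper ring-buffer cursor: after a batch is consumed the head advances by n_tokens and wraps back to 0 once it reaches the cache size, so the next slot search starts from a valid index. A minimal sketch of that update with a stand-in struct (kv_cache_view is hypothetical, not the llama.cpp type):

    #include <cassert>
    #include <cstdint>

    // Stand-in for the two fields used above; not the real llama_kv_cache.
    struct kv_cache_view { uint32_t head; uint32_t size; };

    // Advance the ring-buffer cursor and wrap it, mirroring the added lines above.
    static void advance_kv_head(kv_cache_view & kv, uint32_t n_tokens) {
        kv.head += n_tokens;
        if (kv.head >= kv.size) {
            kv.head = 0; // keep the next slot search anchored at a valid index
        }
    }

    int main() {
        kv_cache_view kv = { /*head=*/510, /*size=*/512 };
        advance_kv_head(kv, 8); // 518 >= 512, so the head wraps
        assert(kv.head == 0);
        return 0;
    }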
@@ -5040,7 +6336,6 @@ private:
         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
             bool split_condition = false;
-            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
             int bytes_remain = text_utf.size() - i;
             // forward backward lookups
             const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5066,9 +6361,9 @@ private:
             if (!split_condition && bytes_remain >= 3) {
                 // 're|'ve|'ll
                 if (utf_char == "\'" && (
-                    (utf_char_next == "r"
-                    (utf_char_next == "v"
-                    (utf_char_next == "l"
+                    (utf_char_next == "r" && utf_char_next_next == "e") ||
+                    (utf_char_next == "v" && utf_char_next_next == "e") ||
+                    (utf_char_next == "l" && utf_char_next_next == "l"))
                     ) {
                     split_condition = true;
                 }
@@ -5119,7 +6414,7 @@ private:
                 else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                     split_condition = true;
                 }
-                else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next)
+                else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                     split_condition = true;
                 }
             }
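The two tokenizer hunks above tighten the GPT-2 style pre-tokenisation lookahead: an apostrophe now only triggers a split when the next two characters actually spell 're, 've or 'll, and the whitespace lookahead explicitly requires a following letter or digit. A simplified sketch of the contraction check, assuming the text has already been split into single UTF-8 characters (is_contraction_at is a hypothetical helper, not the llama.cpp code, and it covers only the three-character cases from this hunk):

    #include <string>
    #include <vector>

    // True when text_utf[i] is an apostrophe that starts 're, 've or 'll,
    // mirroring the corrected condition in the diff above.
    static bool is_contraction_at(const std::vector<std::string> & text_utf, size_t i) {
        if (text_utf[i] != "\'" || i + 2 >= text_utf.size()) {
            return false; // needs at least two more characters of lookahead
        }
        const std::string & a = text_utf[i + 1];
        const std::string & b = text_utf[i + 2];
        return (a == "r" && b == "e") ||
               (a == "v" && b == "e") ||
               (a == "l" && b == "l");
    }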
@@ -6635,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
            ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
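The widened check above makes models that store their attention weights in one fused tensor (attn_qkv.weight, as MPT and BLOOM do) count toward n_attention_wv just like models with a separate attn_v.weight, which feeds the per-layer quantization-type heuristics. A small sketch of that name-based counting (count_attention_tensors and the tensor names in main() are illustrative):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Count tensors that hold attention V weights, either as a separate tensor
    // or as part of a fused QKV tensor, matching the condition in the hunk above.
    static int count_attention_tensors(const std::vector<std::string> & names) {
        int n_attention_wv = 0;
        for (const std::string & name : names) {
            if (name.find("attn_v.weight")   != std::string::npos ||
                name.find("attn_qkv.weight") != std::string::npos) {
                ++n_attention_wv;
            }
        }
        return n_attention_wv;
    }

    int main() {
        std::printf("%d\n", count_attention_tensors({
            "blk.0.attn_qkv.weight", "blk.0.ffn_down.weight", "blk.1.attn_qkv.weight"})); // prints 2
        return 0;
    }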
@@ -6672,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
     const size_t meta_size = gguf_get_meta_size(ctx_out);
 
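The added fout.exceptions(std::ofstream::failbit) makes the quantization output stream throw on the first failed write instead of silently producing a truncated file. A hedged, self-contained illustration of the same pattern (the path below is hypothetical):

    #include <fstream>
    #include <iostream>

    int main() {
        try {
            std::ofstream fout("/tmp/example.bin", std::ios::binary); // hypothetical output path
            fout.exceptions(std::ofstream::failbit);                  // fail fast on write errors
            const char data[4] = {0, 1, 2, 3};
            fout.write(data, sizeof(data));
        } catch (const std::ios_base::failure & err) {
            std::cerr << "write failed: " << err.what() << std::endl;
            return 1;
        }
        return 0;
    }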
@@ -8166,7 +9462,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
             } else {
-
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
             }
             break;
         }
@@ -8182,7 +9480,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
         } else if (llama_is_control_token(model->vocab, token)) {
             ;
         } else {
-
+            // TODO: for now we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
+            // GGML_ASSERT(false);
         }
         break;
     }
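Both hunks above relax llama_token_to_piece() so that unsupported token types are silently suppressed, like CONTROL tokens, rather than hitting an assertion. A hedged usage sketch; the 64-byte buffer and the treatment of a negative return value as "piece did not fit / nothing to emit" are assumptions about this revision, not documented guarantees:

    #include <string>
    #include "llama.h"

    // Convert a single token id into its text piece via the public API patched above.
    static std::string token_to_text(const llama_model * model, llama_token token) {
        char buf[64];
        const int n = llama_token_to_piece(model, token, buf, (int) sizeof(buf));
        if (n < 0) {
            return ""; // assumed: buffer too small or a suppressed token type
        }
        return std::string(buf, n);
    }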