llama_cpp 0.7.0 → 0.7.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +354 -126
- data/ext/llama_cpp/src/ggml-metal.metal +128 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +17 -15
- data/ext/llama_cpp/src/ggml.c +58 -46
- data/ext/llama_cpp/src/ggml.h +12 -7
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1360 -60
- data/lib/llama_cpp/version.rb +2 -2
- metadata +4 -2
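Most of this release is a refresh of the bundled llama.cpp sources under `ext/llama_cpp/src`. The most visible behavioural change in the diff below is the KV-cache bookkeeping: `cache.head` now records where the search for a free slot should resume, and the sequence-removal helpers reset it when they free cells. As a rough illustration of that search pattern only — a minimal sketch with simplified stand-in structs, not the gem's or llama.cpp's actual API:

```cpp
#include <cstdint>
#include <set>
#include <vector>

// Simplified stand-ins for llama.cpp's KV cell/cache (illustrative only).
struct kv_cell  { int pos = -1; std::set<int> seq_id; };
struct kv_cache { uint32_t head = 0; std::vector<kv_cell> cells; };

// Look for n_tokens consecutive free cells, starting at cache.head and
// wrapping around at most once, mirroring llama_kv_cache_find_slot below.
static bool find_slot(kv_cache & cache, uint32_t n_tokens) {
    const uint32_t n_ctx = cache.cells.size();
    if (n_tokens > n_ctx) {
        return false; // the request can never fit
    }

    uint32_t n_tested = 0;
    while (true) {
        if (cache.head + n_tokens > n_ctx) {
            n_tested += n_ctx - cache.head; // count the cells skipped at the end
            cache.head = 0;                 // wrap around and keep searching
            continue;
        }
        bool found = true;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (cache.cells[cache.head + i].pos >= 0) { // cell still in use
                found = false;
                cache.head += i + 1;
                n_tested   += i + 1;
                break;
            }
        }
        if (found) {
            return true; // cache.head now points at the first cell of the slot
        }
        if (n_tested >= n_ctx) {
            return false; // no contiguous run of n_tokens free cells
        }
    }
}
```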
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -186,7 +186,9 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -199,7 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT,            "mpt"       },
     { LLM_ARCH_BAICHUAN,       "baichuan"  },
     { LLM_ARCH_STARCODER,      "starcoder" },
-    { LLM_ARCH_REFACT,         "refact"    },
+    { LLM_ARCH_PERSIMMON,      "persimmon" },
+    { LLM_ARCH_REFACT,         "refact"    },
+    { LLM_ARCH_BLOOM,          "bloom"     },
 };
 
 enum llm_kv {
@@ -302,6 +306,7 @@ struct LLM_KV {
 
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -318,6 +323,8 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
 };
 
 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -399,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PERSIMMON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd"},
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm"},
+            { LLM_TENSOR_OUTPUT,        "output"},
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm"},
+            { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv"},
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output"},
+            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm"},
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm"},
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down"},
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up"},
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+        },
+    },
     {
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
     {
@@ -437,6 +469,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -954,6 +1001,7 @@ enum e_model {
     MODEL_1B,
     MODEL_3B,
     MODEL_7B,
+    MODEL_8B,
     MODEL_13B,
     MODEL_15B,
     MODEL_30B,
@@ -984,6 +1032,9 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
 
+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
@@ -1036,6 +1087,10 @@ struct llama_layer {
     struct ggml_tensor * attn_norm_b;
     struct ggml_tensor * attn_norm_2;
     struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;
 
     // attention
     struct ggml_tensor * wq;
@@ -1077,6 +1132,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;
 
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
 
@@ -1162,6 +1220,8 @@ struct llama_model {
 
     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_norm;
+    struct ggml_tensor * tok_norm_b;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1291,7 +1351,11 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
+    // TODO: this should be:
+    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    // change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1334,6 +1398,8 @@ static bool llama_kv_cache_init(
 
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
@@ -1349,8 +1415,8 @@ static bool llama_kv_cache_find_slot(
 
     while (true) {
         if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
             cache.head = 0;
-            n_tested += n_ctx - cache.head;
             continue;
         }
 
@@ -1401,6 +1467,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }
 
 static void llama_kv_cache_seq_rm(
@@ -1408,6 +1477,8 @@ static void llama_kv_cache_seq_rm(
         llama_seq_id   seq_id,
            llama_pos   p0,
            llama_pos   p1) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
@@ -1416,9 +1487,13 @@ static void llama_kv_cache_seq_rm(
             cache.cells[i].seq_id.erase(seq_id);
             if (cache.cells[i].seq_id.empty()) {
                 cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_cp(
@@ -1430,6 +1505,8 @@ static void llama_kv_cache_seq_cp(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+    cache.head = 0;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1438,12 +1515,18 @@ static void llama_kv_cache_seq_cp(
 }
 
 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_shift(
@@ -1452,6 +1535,8 @@ static void llama_kv_cache_seq_shift(
            llama_pos   p0,
            llama_pos   p1,
            llama_pos   delta) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
@@ -1461,12 +1546,17 @@ static void llama_kv_cache_seq_shift(
             if (cache.cells[i].pos < 0) {
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) new_head = i;
             } else {
                 cache.has_shift = true;
                 cache.cells[i].delta = delta;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
 }
 
 //
@@ -1670,7 +1760,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ctx, true);
         }
@@ -1688,7 +1778,7 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
         if (cur == NULL) {
@@ -1867,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
+        case MODEL_8B:  return "8B";
         case MODEL_13B: return "13B";
         case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
@@ -1979,6 +2070,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_REFACT:
             {
                 GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1987,6 +2086,33 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                        } break;
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                hparams.f_clamp_kqv = 0.0f;
+
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_30B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -2131,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_gqa            = %u\n",     __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
+    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
@@ -2230,8 +2358,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2266,8 +2394,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2296,8 +2424,8 @@ static void llm_load_tensors(
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2332,8 +2460,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2366,8 +2494,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2404,8 +2532,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2443,8 +2571,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-                    ggml_backend backend_norm;
-                    ggml_backend backend_output;
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2481,8 +2609,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2515,6 +2643,216 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+                const int i_gpu_start = n_layer - n_gpu_layers;
+                model.layers.resize(n_layer);
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    auto & layer = model.layers[i];
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+                    layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
+                    layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                // TODO: CPU-only for now
+
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
+                model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                            ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+                            ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3)          +
+                            ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2);
+                    }
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) +
+                            ggml_nbytes(layer.wqkv) +
+                            ggml_nbytes(layer.wo) +
+                            ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w2) +
+                            ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -2624,8 +2962,8 @@ static bool llama_model_load(
 }
 
 static struct ggml_cgraph * llm_build_llama(
-    llama_context & lctx,
-    const llama_batch & batch) {
+         llama_context & lctx,
+     const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -2663,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3051,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3452,11 +3786,9 @@ static struct ggml_cgraph * llm_build_refact(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3806,11 +4138,9 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
    struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4166,11 +4496,9 @@ static struct ggml_cgraph * llm_build_starcoder(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4381,19 +4709,975 @@ static struct ggml_cgraph * llm_build_starcoder(
     return gf;
 }
 
-static struct ggml_cgraph * llama_build_graph(
+static struct ggml_cgraph * llm_build_persimmon(
          llama_context & lctx,
      const llama_batch & batch) {
     const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
 
-    struct ggml_cgraph * result = NULL;
+    const auto & kv_self = lctx.kv_self;
 
-    switch (model.arch) {
-        case LLM_ARCH_LLAMA:
-            {
-                result = llm_build_llama(lctx, batch);
-            } break;
-        case LLM_ARCH_BAICHUAN:
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const auto & cparams = lctx.cparams;
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = cparams.n_ctx;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+    const size_t n_rot = n_embd_head / 2;
+
+    const float freq_base = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    auto & buf_compute = lctx.buf_compute;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    // we rotate only the first n_rot dimensions.
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_rot, n_head, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
+                        ),
+                        K_shift, n_rot, 2, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }
+    for (int il=0; il < n_layer; ++il) {
+        struct ggml_tensor * residual = inpL;
+        offload_func_t offload_func = llama_nop;
+        {
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(cur);
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
+            offload_func(cur);
+            ggml_format_name(cur, "input_layernorm_%d", il);
+        }
+        // self attention
+        {
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+            offload_func_kq(cur);
+
+            // split qkv
+            GGML_ASSERT(n_head_kv == n_head);
+            ggml_set_name(cur, format("qkv_%d", il).c_str());
+            struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
+            offload_func_kq(tmpqkv);
+            struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
+            offload_func_kq(tmpqkv_perm);
+            ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
+            struct ggml_tensor * tmpq = ggml_view_3d(
+                    ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                    0
+                );
+            offload_func_kq(tmpq);
+            struct ggml_tensor * tmpk = ggml_view_3d(
+                    ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
+                );
+            offload_func_kq(tmpk);
+            // Q/K Layernorm
+            tmpq = ggml_norm(ctx0, tmpq, norm_eps);
+            offload_func_kq(tmpq);
+            tmpq =  ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
+            offload_func_kq(tmpq);
+            tmpq =  ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
+            offload_func_kq(tmpq);
+
+            tmpk = ggml_norm(ctx0, tmpk, norm_eps);
+            offload_func_v(tmpk);
+            tmpk =  ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
+            offload_func_v(tmpk);
+            tmpk =  ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
+            offload_func_v(tmpk);
+
+            // RoPE the first n_rot of q/k, pass the other half, and concat.
+            struct ggml_tensor * qrot = ggml_view_3d(
+                ctx0, tmpq, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpq) * n_embd_head,
+                ggml_element_size(tmpq) * n_embd_head * n_head,
+                0
+            );
+            offload_func_kq(qrot);
+            ggml_format_name(qrot, "qrot_%d", il);
+            struct ggml_tensor * krot = ggml_view_3d(
+                ctx0, tmpk, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpk) * n_embd_head,
+                ggml_element_size(tmpk) * n_embd_head * n_head,
+                0
+            );
+            offload_func_kq(krot);
+            ggml_format_name(krot, "krot_%d", il);
+
+            // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+            struct ggml_tensor * qpass = ggml_view_3d(
+                ctx0, tmpq, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpq) * n_embd_head,
+                ggml_element_size(tmpq) * n_embd_head * n_head,
+                ggml_element_size(tmpq) * n_rot
+            );
+            offload_func_kq(qpass);
+            ggml_format_name(qpass, "qpass_%d", il);
+            struct ggml_tensor * kpass = ggml_view_3d(
+                ctx0, tmpk, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpk) * n_embd_head,
+                ggml_element_size(tmpk) * n_embd_head * n_head,
+                ggml_element_size(tmpk) * n_rot
+            );
+            offload_func_kq(kpass);
+            ggml_format_name(kpass, "kpass_%d", il);
+
+            struct ggml_tensor * qrotated = ggml_rope_custom(
+                    ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+            );
+            offload_func_kq(qrotated);
+            struct ggml_tensor * krotated = ggml_rope_custom(
+                    ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+            );
+            offload_func_kq(krotated);
+            // ggml currently only supports concatenation on dim=2
+            // so we need to permute qrot, qpass, concat, then permute back.
+            qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+            offload_func_kq(qrotated);
+            krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+            offload_func_kq(krotated);
+
+            qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+            offload_func_kq(qpass);
+            kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+            offload_func_kq(kpass);
+
+            struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+            offload_func_kq(Qcur);
+            struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+            offload_func_kq(Q);
+
+            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+            offload_func_kq(Kcur);
+            {
+                struct ggml_tensor * tmpv = ggml_view_3d(
+                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                        ggml_element_size(tmpqkv_perm) * n_embd_head,
+                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
+                    );
+                offload_func_v(tmpv);
+                // store K, V in cache
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(
+                    ctx0, kv_self.k, n_tokens*n_embd_gqa,
+                    (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
+                );
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+            struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
+                    n_embd_head, n_kv, n_head_kv,
+                    ggml_element_size(kv_self.k)*n_embd_gqa,
+                    ggml_element_size(kv_self.k)*n_embd_head,
+                    ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+
+            offload_func_kq(K);
+            ggml_format_name(K, "K_%d", il);
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_kq(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].bo);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+        {
+            // MLP
+            {
+                // Norm
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                offload_func(cur);
+                cur = ggml_add(ctx0,
+                    ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
+                    model.layers[il].ffn_norm_b
+                );
+                ggml_set_name(cur, "ffn_norm");
+                offload_func(cur);
+            }
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);
+
+            cur = ggml_add(ctx0, cur, model.layers[il].b3);
+            offload_func(cur);
+            ggml_set_name(cur, "result_ffn_up");
+
+            cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
+            ggml_set_name(cur, "result_ffn_act");
+            offload_func(cur);
+            offload_func(cur->src[0]);
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+            cur = ggml_add(ctx0,
+                cur,
+                model.layers[il].b2);
+            offload_func(cur);
+            ggml_set_name(cur, "outFF");
+        }
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_outFF");
+        inpL = cur;
+    }
+    cur = inpL;
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        offload_func_nr(cur);
+
+        cur = ggml_add(ctx0, cur, model.output_norm_b);
+        // offload_func_nr(cur);
+
+        ggml_set_name(cur, "result_norm");
+    }
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+    return gf;
+}
+
+static struct ggml_cgraph * llm_build_bloom(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, token, norm_eps);
+        inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
+    }
+
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            // store key and value to memory
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
5338
|
+
}
|
5339
|
+
ggml_set_name(cur, "result_norm");
|
5340
|
+
|
5341
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5342
|
+
ggml_set_name(cur, "result_output");
|
5343
|
+
|
5344
|
+
ggml_build_forward_expand(gf, cur);
|
5345
|
+
|
5346
|
+
ggml_free(ctx0);
|
5347
|
+
|
5348
|
+
return gf;
|
5349
|
+
}
|
5350
|
+
|
5351
|
+
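The KQ_mask loop in llm_build_bloom above writes 0 where a batch token may attend to a cached cell and -INFINITY where it may not: a cell is visible only if it belongs to the same sequence and its position is not later than the token's. A minimal standalone sketch of that rule, using hypothetical kv_cell/batch_pos/batch_seq_id stand-ins rather than the real llama.cpp structures:

#include <cmath>
#include <cstdio>
#include <vector>

// Simplified stand-in for one KV cache cell (hypothetical; the real struct lives in llama.cpp).
struct kv_cell { int pos; int seq_id; };

int main() {
    // 4 cached cells of sequence 0, and a batch of 2 new tokens at positions 2 and 3.
    std::vector<kv_cell> cells = { {0, 0}, {1, 0}, {2, 0}, {3, 0} };
    std::vector<int> batch_pos    = { 2, 3 };
    std::vector<int> batch_seq_id = { 0, 0 };

    const int n_kv     = (int) cells.size();
    const int n_tokens = (int) batch_pos.size();
    std::vector<float> mask(n_kv * n_tokens, 0.0f); // 0 = attend, -INF = masked

    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            // mask cells from other sequences and cells that lie in the future
            if (cells[i].seq_id != batch_seq_id[j] || cells[i].pos > batch_pos[j]) {
                mask[j*n_kv + i] = -INFINITY;
            }
        }
    }

    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            printf("%5s ", std::isinf(mask[j*n_kv + i]) ? "-inf" : "0");
        }
        printf("\n");
    }
    return 0;
}

The same mask is broadcast to every attention head, which is why the graph only allocates it for one head.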
+static struct ggml_cgraph * llm_build_mpt(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    const float norm_eps       = hparams.f_norm_eps;
+    const float clamp_kqv      = hparams.f_clamp_kqv;
+    const float max_alibi_bias = hparams.f_max_alibi_bias;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    //int warmup = 0;
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+            //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
+        }
+
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * attn_norm;
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        // self-attention
+        // TODO: refactor into common function (shared with LLaMA)
+        {
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);
+
+            attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+            offload_func(attn_norm);
+
+            if (1) {
+                cur = attn_norm;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            if (clamp_kqv > 0.0f) {
+                cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                offload_func_kq(cur);
+            }
+
+            const size_t wsize = ggml_type_size(cur->type);
+
+            struct ggml_tensor * Qcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                0);
+            offload_func_kq(Qcur);
+
+            struct ggml_tensor * Kcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * n_head);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_kq(Kcur);
+
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // TODO: replace with ggml_add()
+            struct ggml_tensor * KQ_scaled_alibi =
+                ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+            offload_func_kq(KQ_scaled_alibi);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
+
+        struct ggml_tensor * attn_out = cur;
+
+        // feed forward
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, attn_out, norm_eps);
+                offload_func(cur);
+
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+            }
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);
+
+            cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+        }
+
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        ggml_set_name(cur, "result_norm");
+    }
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
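llm_build_mpt above carves Q, K and V out of a single fused wqkv projection purely with view offsets and strides: each token's row holds (n_head + 2*n_head_kv) head blocks of n_embd_head values, with Q at offset 0, K after the n_head query blocks and V after the key blocks. A small sketch of the same arithmetic on a plain float buffer (toy sizes, not the real model dimensions):

#include <cstdio>
#include <vector>

// Toy sizes (assumptions for illustration only).
constexpr int n_embd_head = 4; // values per head
constexpr int n_head      = 2; // query heads
constexpr int n_head_kv   = 2; // key/value heads (== n_head for MPT)
constexpr int row         = n_embd_head * (n_head + 2 * n_head_kv); // fused QKV row length

int main() {
    // One token's fused QKV row: values 0..row-1 so the split is easy to eyeball.
    std::vector<float> qkv(row);
    for (int i = 0; i < row; ++i) qkv[i] = (float) i;

    // Offsets in elements, mirroring the byte offsets passed to ggml_view_3d:
    const int q_off = 0;
    const int k_off = n_embd_head * n_head;               // after the query heads
    const int v_off = n_embd_head * (n_head + n_head_kv); // after query + key heads

    printf("Q: ");
    for (int i = 0; i < n_embd_head * n_head;    ++i) printf("%g ", qkv[q_off + i]);
    printf("\nK: ");
    for (int i = 0; i < n_embd_head * n_head_kv; ++i) printf("%g ", qkv[k_off + i]);
    printf("\nV: ");
    for (int i = 0; i < n_embd_head * n_head_kv; ++i) printf("%g ", qkv[v_off + i]);
    printf("\n");
    return 0;
}

The row stride wsize * n_embd_head * (n_head + 2 * n_head_kv) in the real views plays the role of `row` here: it jumps from one token's fused row to the next.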
+static struct ggml_cgraph * llama_build_graph(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+
+    struct ggml_cgraph * result = NULL;
+
+    switch (model.arch) {
+        case LLM_ARCH_LLAMA:
+            {
+                result = llm_build_llama(lctx, batch);
+            } break;
+        case LLM_ARCH_BAICHUAN:
             {
                 result = llm_build_baichaun(lctx, batch);
             } break;
@@ -4405,10 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                result = llm_build_persimmon(lctx, batch);
+            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm_build_refact(lctx, batch);
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                result = llm_build_bloom(lctx, batch);
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                result = llm_build_mpt(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
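llama_build_graph itself is only a per-architecture dispatch: each llm_arch value maps to exactly one graph-builder function. The same pattern can be expressed as a lookup table; the sketch below uses hypothetical ctx/batch/graph stand-ins and is not the llama.cpp API:

#include <cstdio>
#include <functional>
#include <map>
#include <string>

// Hypothetical stand-ins for the real context/batch/graph types.
struct ctx   {};
struct batch {};
struct graph { std::string built_by; };

using builder = std::function<graph(ctx &, const batch &)>;

int main() {
    // One builder per architecture, mirroring the switch in llama_build_graph.
    const std::map<std::string, builder> builders = {
        { "bloom", [](ctx &, const batch &) { return graph{"llm_build_bloom"}; } },
        { "mpt",   [](ctx &, const batch &) { return graph{"llm_build_mpt"};   } },
    };

    ctx c;
    batch b;
    const auto it = builders.find("mpt");
    if (it != builders.end()) {
        printf("dispatched to %s\n", it->second(c, b).built_by.c_str());
    }
    return 0;
}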
@@ -4420,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 // - lctx: llama context
 // - batch: batch to evaluate
-// - n_threads: number of threads to use
 //
 // return 0 on success
 // return positive int on warning
@@ -4487,10 +5782,6 @@ static int llama_decode_internal(
         batch.seq_id = seq_id.data();
     }
 
-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -4543,7 +5834,8 @@ static int llama_decode_internal(
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
         model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT;
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -4576,8 +5868,12 @@ static int llama_decode_internal(
 #endif
 
     // update the kv ring buffer
-    lctx.kv_self.head += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
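The added lines treat kv_self.head as the write head of a ring buffer: after advancing it by the batch size, it wraps back to zero once it passes the cache size, so the next free-slot search starts from a valid index. A standalone sketch of that head-advance rule (hypothetical helper name, not part of the diff):

#include <cstdio>

// Advance the write head of a ring buffer of `size` slots by `n` tokens and
// wrap it back to the start once it passes the end, mirroring the
// kv_self.head update in llama_decode_internal above.
static int advance_head(int head, int n, int size) {
    head += n;
    if (head >= size) {
        head = 0;
    }
    return head;
}

int main() {
    int head = 0;
    const int size = 8;
    for (int step = 0; step < 5; ++step) {
        head = advance_head(head, 3, size); // decode 3 tokens per step
        printf("after step %d: head = %d\n", step, head);
    }
    return 0;
}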
@@ -5040,7 +6336,6 @@ private:
         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
             bool split_condition = false;
-            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
             int bytes_remain = text_utf.size() - i;
             // forward backward lookups
             const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5066,9 +6361,9 @@ private:
            if (!split_condition && bytes_remain >= 3) {
                // 're|'ve|'ll
                if (utf_char == "\'" && (
-                   (utf_char_next == "r"
-                   (utf_char_next == "v"
-                   (utf_char_next == "l"
+                   (utf_char_next == "r" && utf_char_next_next == "e") ||
+                   (utf_char_next == "v" && utf_char_next_next == "e") ||
+                   (utf_char_next == "l" && utf_char_next_next == "l"))
                    ) {
                    split_condition = true;
                }
@@ -5119,7 +6414,7 @@ private:
                else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                    split_condition = true;
                }
-               else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next)
+               else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                    split_condition = true;
                }
            }
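The corrected condition splits before the contractions 're, 've and 'll by looking at the two characters that follow the apostrophe. A standalone sketch of that lookahead, simplified to plain bytes instead of the UTF-8 codepoints the real tokenizer works on:

#include <cstdio>
#include <string>

// Returns true if text[i] starts one of the contractions 're, 've, 'll
// (simplified lookahead; hypothetical helper, not the llama.cpp function).
static bool is_contraction(const std::string & text, size_t i) {
    if (i + 2 >= text.size() || text[i] != '\'') {
        return false;
    }
    const char a = text[i + 1];
    const char b = text[i + 2];
    return (a == 'r' && b == 'e') || (a == 'v' && b == 'e') || (a == 'l' && b == 'l');
}

int main() {
    const std::string text = "they're well";
    for (size_t i = 0; i < text.size(); ++i) {
        if (is_contraction(text, i)) {
            printf("split before \"%s\"\n", text.substr(i, 3).c_str());
        }
    }
    return 0;
}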
@@ -6635,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6672,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
     const size_t meta_size = gguf_get_meta_size(ctx_out);
 
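The added fout.exceptions(std::ofstream::failbit) call makes the quantization output stream throw on the first failed write instead of silently producing a truncated file. A small sketch of the same idiom with a hypothetical output path:

#include <cstdio>
#include <fstream>

int main() {
    try {
        // Hypothetical output path; opening a non-writable location sets failbit.
        std::ofstream fout("/nonexistent-dir/out.gguf", std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        fout << "gguf";                          // throws std::ios_base::failure if the stream is bad
    } catch (const std::ios_base::failure & err) {
        std::fprintf(stderr, "write failed: %s\n", err.what());
        return 1;
    }
    return 0;
}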
@@ -8166,7 +9462,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                buf[0] = llama_token_to_byte(model->vocab, token);
                return 1;
            } else {
-
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
            }
            break;
        }
@@ -8182,7 +9480,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
            } else if (llama_is_control_token(model->vocab, token)) {
                ;
            } else {
-
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
            }
            break;
        }