llama_cpp 0.6.0 → 0.7.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +622 -150
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +358 -131
- data/ext/llama_cpp/src/ggml-metal.metal +137 -47
- data/ext/llama_cpp/src/ggml-opencl.cpp +136 -68
- data/ext/llama_cpp/src/ggml.c +812 -365
- data/ext/llama_cpp/src/ggml.h +25 -7
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +2387 -421
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +5 -2
data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -1,6 +1,8 @@
 #define LLAMA_API_INTERNAL
 #include "llama.h"
 
+#include "unicode.h"
+
 #include "ggml.h"
 
 #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
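The `is_float_close` helper added above is used further down in this diff by `llama_hparams::operator!=` to compare floating-point hyperparameters with an absolute tolerance instead of bitwise equality. A minimal, self-contained sketch of the same comparison pattern (the values in `main` are hypothetical, not from the gem):

```cpp
#include <cmath>
#include <cstdio>
#include <stdexcept>

// Same contract as the helper in the hunk above: absolute-tolerance float
// comparison that short-circuits on exact equality and rejects infinities.
static bool is_float_close(float a, float b, float abs_tol) {
    if (abs_tol < 0.0f) {
        throw std::invalid_argument("Tolerance must be non-negative");
    }
    if (a == b) {
        return true;  // also covers +inf vs +inf and -inf vs -inf
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false; // one side infinite, the other finite (or the opposite infinity)
    }
    return std::fabs(b - a) <= abs_tol;
}

int main() {
    const float eps_a = 1e-5f;
    const float eps_b = 1e-5f + 1e-12f; // differs only by rounding noise
    std::printf("%d\n", is_float_close(eps_a, eps_b, 1e-9f)); // prints 1
}
```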
@@ -163,6 +186,9 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
+    LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -175,6 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT,       "mpt"       },
     { LLM_ARCH_BAICHUAN,  "baichuan"  },
     { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT,    "refact"    },
+    { LLM_ARCH_BLOOM,     "bloom"     },
 };
 
 enum llm_kv {
@@ -277,6 +306,7 @@ struct LLM_KV {
 
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -293,6 +323,8 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
 };
 
 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -374,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PERSIMMON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd"},
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm"},
+            { LLM_TENSOR_OUTPUT,        "output"},
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm"},
+            { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv"},
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output"},
+            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm"},
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm"},
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down"},
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up"},
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+        },
+    },
     {
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
         },
     },
     {
@@ -395,6 +452,38 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -912,6 +1001,7 @@ enum e_model {
     MODEL_1B,
     MODEL_3B,
     MODEL_7B,
+    MODEL_8B,
     MODEL_13B,
     MODEL_15B,
     MODEL_30B,
@@ -942,8 +1032,28 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
 
+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
     bool operator!=(const llama_hparams & other) const {
-
+        if (this->vocab_only  != other.vocab_only)  return true;
+        if (this->n_vocab     != other.n_vocab)     return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd      != other.n_embd)      return true;
+        if (this->n_head      != other.n_head)      return true;
+        if (this->n_head_kv   != other.n_head_kv)   return true;
+        if (this->n_layer     != other.n_layer)     return true;
+        if (this->n_rot       != other.n_rot)       return true;
+        if (this->n_ff        != other.n_ff)        return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }
 
     uint32_t n_gqa() const {
@@ -977,6 +1087,10 @@ struct llama_layer {
     struct ggml_tensor * attn_norm_b;
     struct ggml_tensor * attn_norm_2;
     struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;
 
     // attention
     struct ggml_tensor * wq;
@@ -1018,6 +1132,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;
 
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
 
@@ -1071,6 +1188,10 @@ struct llama_vocab {
     id special_pad_id = -1;
 
     id linefeed_id = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left, " ", "\u0120");
@@ -1099,6 +1220,8 @@ struct llama_model {
 
     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_norm;
+    struct ggml_tensor * tok_norm_b;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1228,7 +1351,11 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
+    // TODO: this should be:
+    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    //   change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
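For context, the buffer sized above holds both the K and the V cache plus slack for tensor metadata (which the TODO suggests replacing with `2u*ggml_tensor_overhead()`). A rough sketch of the arithmetic, with hypothetical shapes:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical shapes, only to make the 2u*n_elements*type_size term concrete.
    const uint64_t n_embd  = 4096;
    const uint64_t n_layer = 32;
    const uint64_t n_ctx   = 2048;
    const uint64_t n_elements = n_embd * n_layer * n_ctx; // one cache tensor (K or V)

    const uint64_t type_size = 2;           // e.g. f16 elements
    const uint64_t MB        = 1024 * 1024; // slack for tensor metadata, as above

    // K cache + V cache + padding, mirroring cache.buf.resize(...) in the hunk.
    const uint64_t buf_size = 2 * n_elements * type_size + 2 * MB;
    std::printf("KV buffer: %.2f MB\n", buf_size / (double) MB);
}
```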
@@ -1271,9 +1398,11 @@ static bool llama_kv_cache_init(
 
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
-
-
+           struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
     const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
@@ -1286,8 +1415,8 @@ static bool llama_kv_cache_find_slot(
 
     while (true) {
         if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
             cache.head = 0;
-            n_tested += n_ctx - cache.head;
             continue;
         }
 
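The reordering in this hunk is a bug fix: the cells skipped at the end of the ring must be counted before `cache.head` is reset, otherwise `n_ctx - cache.head` is evaluated with `head == 0` and always adds the full `n_ctx`, making the search give up too early. A standalone sketch of the circular scan (a hypothetical helper, not the gem's API):

```cpp
#include <cstdint>
#include <vector>

// Sketch of the circular free-slot scan fixed above: find n_tokens
// consecutive free cells in a ring of n_ctx cells, or return -1.
static int find_slot(const std::vector<bool> & is_free, uint32_t n_tokens) {
    const uint32_t n_ctx = (uint32_t) is_free.size();
    if (n_tokens > n_ctx) return -1;

    uint32_t head = 0, n_tested = 0;
    while (true) {
        if (head + n_tokens > n_ctx) {
            // Count the cells skipped at the end of the ring *before*
            // resetting head; the old order added n_ctx - 0 and overcounted.
            n_tested += n_ctx - head;
            head = 0;
            continue;
        }
        bool found = true;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (!is_free[head + i]) {
                found = false;
                head     += i + 1; // restart just past the occupied cell
                n_tested += i + 1;
                break;
            }
        }
        if (found) return (int) head;
        if (n_tested >= n_ctx) return -1; // no run of n_tokens free cells
    }
}
```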
@@ -1338,29 +1467,46 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }
 
 static void llama_kv_cache_seq_rm(
-
-
-
-
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    uint32_t new_head = cache.size;
+
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.erase(seq_id);
             if (cache.cells[i].seq_id.empty()) {
                 cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_cp(
-
-
-
-
-
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id_src,
+                 llama_seq_id   seq_id_dst,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    cache.head = 0;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1369,32 +1515,48 @@ static void llama_kv_cache_seq_cp(
 }
 
 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_shift(
-
-
-
-
-
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                    llama_pos   delta) {
+    uint32_t new_head = cache.size;
+
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].pos += delta;
             if (cache.cells[i].pos < 0) {
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) new_head = i;
             } else {
                 cache.has_shift = true;
                 cache.cells[i].delta = delta;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
 }
 
 //
@@ -1598,7 +1760,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta,
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ctx, true);
         }
@@ -1616,7 +1778,7 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
        struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
         if (cur == NULL) {
@@ -1795,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
+        case MODEL_8B:  return "8B";
         case MODEL_13B: return "13B";
         case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
@@ -1907,6 +2070,49 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_REFACT:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_1B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                        } break;
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                hparams.f_clamp_kqv = 0.0f;
+
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps,       gguf_get_val_f32, GGUF_TYPE_FLOAT32, true,  kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv,      gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true,  kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_30B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -1971,6 +2177,7 @@ static void llm_load_vocab(
 
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+            GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
 
             std::string first;
             std::string second;
@@ -2005,6 +2212,7 @@ static void llm_load_vocab(
 
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
 
@@ -2013,12 +2221,13 @@ static void llm_load_vocab(
         token_data.score = scores ? scores[i] : 0.0f;
         token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
+    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
     }
 
     // special tokens
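The `"\u010A"` literal here is worth a note: byte-level BPE vocabularies (GPT-2 style) remap raw bytes to printable code points, and byte 0x0A ('\n') becomes U+010A ('Ċ'), so tokenizing the remapped form recovers the linefeed token id. A small sketch, assuming the standard GPT-2 byte-to-unicode scheme:

```cpp
#include <cstdio>

// GPT-2 style byte-level BPE remaps bytes that aren't printable to code
// points starting at U+0100; for the low control bytes (0x00-0x20) the
// remapping is simply 0x100 + byte, so '\n' (0x0A) becomes U+010A.
int main() {
    const unsigned char byte = 0x0A;        // '\n'
    const unsigned int  cp   = 0x100 + byte;
    std::printf("U+%04X\n", cp);            // prints U+010A
}
```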
@@ -2048,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_gqa            = %u\n",   __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n", __func__, hparams.f_clamp_kqv);
+    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",   __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",   __func__, hparams.rope_freq_scale_train);
@@ -2141,13 +2352,14 @@ static void llm_load_tensors(
     const auto tn = LLM_TN(model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_REFACT:
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
                 // output
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2182,8 +2394,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2212,8 +2424,8 @@ static void llm_load_tensors(
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2248,8 +2460,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2282,8 +2494,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2320,8 +2532,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2359,8 +2571,8 @@ static void llm_load_tensors(
 
                 // output
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2397,8 +2609,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2431,103 +2643,313 @@ static void llm_load_tensors(
                     }
                 }
             } break;
-
-
-
-            }
+        case LLM_ARCH_PERSIMMON:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
-
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
 
-
-
-
-
-
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
 
-
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
 
-
-
+                    model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+                    model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
 
-
-
-
-
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
 
-
-
-
-
-
-
-
+                const uint32_t n_ff = hparams.n_ff;
+                const int i_gpu_start = n_layer - n_gpu_layers;
+                model.layers.resize(n_layer);
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    auto & layer = model.layers[i];
+                    layer.attn_norm     = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias",   i), {n_embd}, backend);
+                    layer.wqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "bias",   i), {n_embd + 2*n_embd_gqa},         backend_split);
+                    layer.wo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "bias",   i), {n_embd},         backend_split);
+                    layer.w2            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "weight", i), {n_ff, n_embd},   backend_split);
+                    layer.b2            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "bias",   i), {n_embd},         backend_split);
+                    layer.w3            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "weight", i), {n_embd, n_ff},   backend_split);
+                    layer.b3            = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "bias",   i), {n_ff},           backend_split);
+                    layer.ffn_norm      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "bias",   i), {n_embd}, backend);
+                    layer.attn_q_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {64}, backend);
+                    layer.attn_k_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {64}, backend);
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                // TODO: CPU-only for now
 
-
-
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.tok_norm       = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd},          GGML_BACKEND_CPU);
+                model.tok_norm_b     = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd},          GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-
-#endif //
-                }
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
 
-
-
-
-
-
-
-
-            ggml_cuda_set_tensor_split(tensor_split);
-        }
-#endif
-
-
-        progress_callback(1.0f, progress_callback_user_data);
-    }
-
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
-}
-
-        int n_gpu_layers,
-        int main_gpu,
-        const float * tensor_split,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
-    try {
-        llama_model_loader ml(fname, use_mmap);
-
-        llm_load_vocab  (ml, model);
-
-        }
-
-        return true;
-    }
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+                    model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa},         backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd},         backend_split);
+
+                    layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd},       backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff},         backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                            ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+                            ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3)          +
+                            ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2);
+                    }
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd},           backend);
+                    layer.wqkv      = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,  "weight", i), {n_embd, 3*n_embd}, backend_split);
+                    layer.wo        = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,  "weight", i), {n_embd, n_embd},   backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) +
+                            ggml_nbytes(layer.wqkv)      +
+                            ggml_nbytes(layer.wo)        +
+                            ggml_nbytes(layer.ffn_norm)  +
+                            ggml_nbytes(layer.w2)        +
+                            ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        default:
+            throw std::runtime_error("unknown architecture");
+        }
+    }
+
+    ml.done_getting_tensors();
+
+    // print memory requirements
+    {
+        // this is the total memory required to run the inference
+        size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        const int max_backend_supported_layers = hparams.n_layer + 3;
+        const int max_offloadable_layers       = hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers       = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+#else
+        (void) n_gpu_layers;
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+    }
+
+    // populate `tensors_by_name`
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
+        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+    }
+
+    (void) tensor_split;
+#ifdef GGML_USE_CUBLAS
+    {
+        ggml_cuda_set_tensor_split(tensor_split);
+    }
+#endif
+
+    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml.mapping);
+
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+}
+
+static bool llama_model_load(
+        const std::string & fname,
+        llama_model & model,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        bool use_mmap,
+        bool use_mlock,
+        bool vocab_only,
+        llama_progress_callback progress_callback,
+        void *progress_callback_user_data) {
+    try {
+        llama_model_loader ml(fname, use_mmap);
+
+        model.hparams.vocab_only = vocab_only;
+
+        llm_load_arch   (ml, model);
+        llm_load_hparams(ml, model);
+        llm_load_vocab  (ml, model);
+
+        llm_load_print_meta(ml, model);
+
+        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+            throw std::runtime_error("vocab size mismatch");
+        }
+
+        if (vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return true;
+        }
+
+        llm_load_tensors(
             ml, model, n_gpu_layers,
             main_gpu, tensor_split,
             use_mlock, progress_callback, progress_callback_user_data);
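The memory report in the hunk above is plain arithmetic: weights offloaded to VRAM are subtracted from the host-side estimate. A toy version with made-up sizes:

```cpp
#include <cstdio>

int main() {
    // Made-up sizes standing in for ctx_size, mmapped_size and vram_weights.
    const double MB = 1024.0 * 1024.0;
    const double ctx_size     =  256.0 * MB;
    const double mmapped_size = 7168.0 * MB; // all weights as mapped from disk
    const double vram_weights = 3584.0 * MB; // the part offloaded to the GPU

    // Mirrors "mem_required = ctx_size + mmapped_size - vram_weights" above:
    // weights living in VRAM are not counted against host memory.
    const double mem_required = ctx_size + mmapped_size - vram_weights;
    std::printf("mem required = %7.2f MB\n", mem_required / MB);
}
```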
@@ -2540,8 +2962,8 @@ static bool llama_model_load(
     }
 
 static struct ggml_cgraph * llm_build_llama(
-
-
+         llama_context & lctx,
+     const llama_batch & batch) {
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -2579,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -2967,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3334,7 +3752,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     return gf;
 }
 
-static struct ggml_cgraph *
+static struct ggml_cgraph * llm_build_refact(
          llama_context & lctx,
      const llama_batch & batch) {
     const auto & model = lctx.model;
@@ -3353,11 +3771,7 @@ static struct ggml_cgraph * llm_build_falcon(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
-
-
-    const float freq_base  = cparams.rope_freq_base;
-    const float freq_scale = cparams.rope_freq_scale;
-    const float norm_eps   = hparams.f_norm_eps;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
@@ -3365,21 +3779,16 @@ static struct ggml_cgraph * llm_build_falcon(
     const int32_t n_kv    = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
-
-
-    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
-    //        kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
+    // printf("n_kv = %d\n", n_kv);
 
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/
+        /*.no_alloc   =*/ true,
     };
 
-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
|
@@ -3436,7 +3845,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3436
3845
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3437
3846
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3438
3847
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3439
|
-
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(
|
3848
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
3440
3849
|
}
|
3441
3850
|
|
3442
3851
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
@@ -3462,47 +3871,8 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3462
3871
|
}
|
3463
3872
|
}
|
3464
3873
|
|
3465
|
-
// KQ_pos - contains the positions
|
3466
|
-
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3467
|
-
offload_func_kq(KQ_pos);
|
3468
|
-
ggml_set_name(KQ_pos, "KQ_pos");
|
3469
|
-
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3470
|
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3471
|
-
int * data = (int *) KQ_pos->data;
|
3472
|
-
for (int i = 0; i < n_tokens; ++i) {
|
3473
|
-
data[i] = batch.pos[i];
|
3474
|
-
}
|
3475
|
-
}
|
3476
|
-
|
3477
|
-
// shift the entire K-cache if needed
|
3478
|
-
if (do_rope_shift) {
|
3479
|
-
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3480
|
-
offload_func_kq(K_shift);
|
3481
|
-
ggml_set_name(K_shift, "K_shift");
|
3482
|
-
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3483
|
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3484
|
-
int * data = (int *) K_shift->data;
|
3485
|
-
for (int i = 0; i < n_ctx; ++i) {
|
3486
|
-
data[i] = kv_self.cells[i].delta;
|
3487
|
-
}
|
3488
|
-
}
|
3489
|
-
|
3490
|
-
for (int il = 0; il < n_layer; ++il) {
|
3491
|
-
struct ggml_tensor * tmp =
|
3492
|
-
ggml_rope_custom_inplace(ctx0,
|
3493
|
-
ggml_view_3d(ctx0, kv_self.k,
|
3494
|
-
n_embd_head, n_head_kv, n_ctx,
|
3495
|
-
ggml_element_size(kv_self.k)*n_embd_head,
|
3496
|
-
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3497
|
-
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3498
|
-
K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
|
3499
|
-
offload_func_kq(tmp);
|
3500
|
-
ggml_build_forward_expand(gf, tmp);
|
3501
|
-
}
|
3502
|
-
}
|
3503
|
-
|
3504
3874
|
for (int il = 0; il < n_layer; ++il) {
|
3505
|
-
|
3875
|
+
ggml_format_name(inpL, "layer_inp_%d", il);
|
3506
3876
|
|
3507
3877
|
offload_func_t offload_func = llama_nop;
|
3508
3878
|
|
@@ -3512,80 +3882,49 @@ static struct ggml_cgraph * llm_build_falcon(
         }
 #endif // GGML_USE_CUBLAS
 
-
-        // TODO: refactor into common function (shared with LLaMA)
-        {
-            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
-            offload_func(attn_norm);
+        struct ggml_tensor * inpSA = inpL;
 
-
-
-
-            offload_func(
-
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
 
-
-
-
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
 
-
-
-
-
-
-
-            cur = attn_norm;
-        }
-
-        // compute QKV
-
-        cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-        offload_func_kq(cur);
-
-        // Note that the strides for Kcur, Vcur are set up so that the
-        // resulting views are misaligned with the tensor's storage
-        // (by applying the K/V offset we shift the tensor's original
-        // view to stick out behind the viewed QKV tensor's allocated
-        // memory, so to say). This is ok because no actual accesses
-        // happen to that out-of-range memory, but it can require some
-        // trickery when trying to accurately dump these views for
-        // debugging.
-
-        const size_t wsize = ggml_type_size(cur->type);
+        // self-attention
+        {
+            // compute Q and K
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
 
-
-        // non-contiguous views is added for the rope operator
-        struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
-            ctx0, cur, n_embd_head, n_head, n_tokens,
-            wsize * n_embd_head,
-            wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            0));
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
             offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
 
-        struct ggml_tensor *
-
-
-            wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            wsize * n_embd_head * n_head));
-        offload_func_kq(tmpk);
-
-        struct ggml_tensor * tmpv = ggml_view_3d(
-            ctx0, cur, n_embd_head, n_head_kv, n_tokens,
-            wsize * n_embd_head,
-            wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            wsize * n_embd_head * (n_head + n_head_kv));
-        offload_func_v(tmpv);
+            struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
 
-
-        struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
             offload_func_kq(Qcur);
-
-        offload_func_kq(Kcur);
+            ggml_set_name(Qcur, "Qcur");
 
+            // store key and value to memory
             {
-
+                // compute the transposed [n_tokens, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
                 offload_func_v(Vcur);
-                offload_func_v(Vcur->src[0]->src[0]);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
@@ -3596,6 +3935,7 @@ static struct ggml_cgraph * llm_build_falcon(
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
+                ggml_set_name(v, "v");
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3614,22 +3954,31 @@ static struct ggml_cgraph * llm_build_falcon(
             offload_func_kq(K);
             ggml_set_name(K, "K");
 
+            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
-
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
             offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
+            // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
+            // split cached V into n_head heads
             struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
                             n_kv, n_embd_head, n_head_kv,
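A note on `ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8)` in the hunk above: it applies ALiBi position bias with `max_bias = 8` instead of RoPE. In the standard ALiBi formulation for a power-of-two head count, head i (1-based) gets slope 2^(-max_bias·i/n_head), and the attention score for a key at distance d is biased by -slope·d. A small sketch of the slope computation (the standard formulation, not the gem's internals):

```cpp
#include <cmath>
#include <cstdio>

// Standard ALiBi slopes for a power-of-two head count: head i (1-based)
// gets slope 2^(-max_bias * i / n_head); the attention score for a key at
// distance d is then biased by -slope * d.
int main() {
    const int   n_head   = 8;
    const float max_bias = 8.0f; // the constant passed to ggml_alibi above
    for (int i = 1; i <= n_head; ++i) {
        const float slope = std::pow(2.0f, -max_bias * i / n_head);
        std::printf("head %d: slope = %g\n", i, slope);
    }
}
```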
@@ -3639,42 +3988,85 @@ static struct ggml_cgraph * llm_build_falcon(
             offload_func_v(V);
             ggml_set_name(V, "V");

+#if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

+            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");

-
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
             offload_func(cur);
             ggml_set_name(cur, "result_wo");
         }

-        struct ggml_tensor *
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");

-        // feed
+        // feed-forward network
         {
-
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");

-
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
             offload_func(cur);
+            ggml_set_name(cur, "result_w1");

-
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
             offload_func(cur);
-            cur
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
             offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
         }

-        cur = ggml_add(ctx0, cur,
-        offload_func(cur);
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx0, cur, inpFF);
         offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");

         // input for next layer
         inpL = cur;
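The feed-forward block added above is the LLaMA-style gated unit: w3 projects up, w1 produces a gate that passes through SiLU, the two are multiplied elementwise ("silu_x_result_w3"), and w2 projects back down. A scalar sketch of the gate, assuming `expf` from <math.h>:

// sketch only — the block computes result_w2 = w2 * (silu(w1*x) * (w3*x))
static inline float silu(float z) {
    return z / (1.0f + expf(-z)); // z * sigmoid(z)
}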
@@ -3684,15 +4076,17 @@ static struct ggml_cgraph * llm_build_falcon(

     // norm
     {
-        cur =
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
         offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");

-        cur =
-
-
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
     }

+    // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");

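`ggml_rms_norm` above scales by the root mean square of the activations — no mean subtraction and no bias — after which the graph multiplies by the learned `output_norm` weight. A scalar sketch under those assumptions (reference helper, not part of the diff):

// sketch: y[i] = x[i] / sqrt(mean(x^2) + eps), then scaled by the norm weight
static void rms_norm_ref(const float * x, const float * w, float * y, int n, float eps) {
    float ss = 0.0f;
    for (int i = 0; i < n; ++i) ss += x[i]*x[i];
    const float scale = 1.0f/sqrtf(ss/n + eps);
    for (int i = 0; i < n; ++i) y[i] = x[i]*scale*w[i];
}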
@@ -3703,7 +4097,7 @@ static struct ggml_cgraph * llm_build_falcon(
     return gf;
 }

-static struct ggml_cgraph *
+static struct ggml_cgraph * llm_build_falcon(
          llama_context & lctx,
      const llama_batch & batch) {
     const auto & model = lctx.model;
@@ -3724,29 +4118,34 @@ static struct ggml_cgraph * llm_build_starcoder(

     GGML_ASSERT(n_embd_head == hparams.n_rot);

-    const float
+    const float freq_base  = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
+    const float norm_eps   = hparams.f_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;

     const int32_t n_tokens = batch.n_tokens;
     const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;

+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
+    //        kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
+
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/
+        /*.no_alloc   =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);

     struct ggml_tensor * cur;
-    struct ggml_tensor * token;
-    struct ggml_tensor * position;
     struct ggml_tensor * inpL;

     if (batch.token) {
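`freq_base` and `freq_scale` configure the rotary embedding used further down: `freq_base` sets the geometric frequency ladder and `freq_scale` linearly rescales positions (the usual extended-context trick). A sketch of the angle for rotary pair i in a head of size `n_embd_head`, assuming the standard RoPE formula (helper name hypothetical):

// sketch: rotation angle of pair i at position pos
static float rope_theta(int i, int pos, float freq_base, float freq_scale, int n_embd_head) {
    return (float) pos * freq_scale * powf(freq_base, -2.0f*i/n_embd_head);
}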
@@ -3758,30 +4157,390 @@ static struct ggml_cgraph * llm_build_starcoder(
         }
         ggml_set_name(inp_tokens, "inp_tokens");

-
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
 #ifdef GGML_USE_MPI
         GGML_ASSERT(false && "not implemented");
 #endif

-
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);

-        ggml_allocr_alloc(lctx.alloc,
+        ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }

-
-
-
-
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
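The loop just above fills the additive mask: a cache cell is visible to token j only if it belongs to the same sequence and is not at a later position; everything else gets -INFINITY, which becomes a zero weight after the soft-max. The predicate, as a sketch:

// sketch of the visibility rule encoded in KQ_mask
static bool cell_visible(const llama_kv_cell & cell, llama_seq_id seq_id, llama_pos pos) {
    return cell.has_seq_id(seq_id) && cell.pos <= pos;
}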
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-
-
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
             }
         }
-
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_head_kv, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                        K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }
+
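The block above re-rotates the cached keys in place rather than recomputing them: because 2-D rotations compose, applying RoPE with the per-cell delta angle to a key encoded at position p yields the key as it would have been encoded at p + delta. Sketch for a single rotary pair (assumes `cosf`/`sinf` from <math.h>):

// sketch: re-rotate one (x, y) pair by the shift angle — rotations compose
static void rope_shift_pair(float * x, float * y, float theta_delta) {
    const float c = cosf(theta_delta), s = sinf(theta_delta);
    const float x0 = *x, y0 = *y;
    *x = x0*c - y0*s;
    *y = x0*s + y0*c;
}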
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * attn_norm;
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        // self-attention
+        // TODO: refactor into common function (shared with LLaMA)
+        {
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);
+
+            attn_norm = ggml_add(ctx0,
+                    ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
+                    model.layers[il].attn_norm_b);
+            offload_func(attn_norm->src[0]);
+            offload_func(attn_norm);
+
+            if (model.layers[il].attn_norm_2) { // Falcon-40B
+                cur = ggml_norm(ctx0, inpL, norm_eps);
+                offload_func(cur);
+
+                cur = ggml_add(ctx0,
+                        ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
+                        model.layers[il].attn_norm_2_b);
+                offload_func(cur->src[0]);
+                offload_func(cur);
+            } else { // Falcon 7B
+                cur = attn_norm;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            // Note that the strides for Kcur, Vcur are set up so that the
+            // resulting views are misaligned with the tensor's storage
+            // (by applying the K/V offset we shift the tensor's original
+            // view to stick out behind the viewed QKV tensor's allocated
+            // memory, so to say). This is ok because no actual accesses
+            // happen to that out-of-range memory, but it can require some
+            // trickery when trying to accurately dump these views for
+            // debugging.
+
+            const size_t wsize = ggml_type_size(cur->type);
+
+            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+            //       non-contiguous views is added for the rope operator
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                0));
+            offload_func_kq(tmpq);
+
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * n_head));
+            offload_func_kq(tmpk);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_v(tmpv);
+
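The three strided views above slice the fused QKV matmul output without copying. Per token, the row holds n_head query heads, then n_head_kv key heads, then n_head_kv value heads, so with element size wsize the layout works out to:

// sketch: byte offsets into one fused QKV row of cur
//   Q at 0
//   K at wsize * n_embd_head *  n_head
//   V at wsize * n_embd_head * (n_head + n_head_kv)
// row stride: wsize * n_embd_head * (n_head + 2*n_head_kv)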
+            // using mode = 2 for neox mode
+            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(Qcur);
+            struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(Kcur);
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * attn_out = cur;
+
+        // feed forward
+        {
+            struct ggml_tensor * inpFF = attn_norm;
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
+            offload_func(cur);
+
+            cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+        }
+
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
+
+        // input for next layer
+        inpL = cur;
+    }
+
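Note the layer topology built above: the attention block and the feed-forward block both consume the same attn_norm(inpL), and their outputs are summed with the layer input — Falcon's parallel residual, rather than the sequential attention-then-FFN form:

// sketch: inpL_next = inpL + attn(norm(inpL)) + mlp(norm(inpL))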
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_add(ctx0,
+                ggml_mul(ctx0, cur, model.output_norm),
+                model.output_norm_b);
+        ggml_set_name(cur, "result_norm");
+    }
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+static struct ggml_cgraph * llm_build_starcoder(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * position;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < n_tokens; ++i) {
+                ((int32_t *) inp_positions->data)[i] = batch.pos[i];
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");

         position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
     }
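Unlike the RoPE-based graphs above, StarCoder uses learned absolute positions: inp_positions gathers rows of model.pos_embeddings, and in the next hunk the result is simply added to the token embeddings (inpL = token + position). In effect:

// sketch: inpL[:,i] = tok_embeddings[token_id(i)] + pos_embeddings[pos(i)]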
@@ -3816,48 +4575,984 @@ static struct ggml_cgraph * llm_build_starcoder(
         }
     }

-    inpL = ggml_add(ctx0, token, position);
-    ggml_set_name(inpL, "inpL");
-
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+
+    return gf;
+}
+
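In the StarCoder block above, the fused wqkv output (plus the bqkv bias) is split with plain 2-D views; the byte offsets assume F32 activations:

// sketch: offsets of the fused QKV row, row stride cur->nb[1]
//   tmpq at byte 0
//   tmpk at sizeof(float) *  n_embd
//   tmpv at sizeof(float) * (n_embd + n_embd_gqa)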
+static struct ggml_cgraph * llm_build_persimmon(
+         llama_context & lctx,
+    const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const auto & cparams = lctx.cparams;
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+    const size_t n_rot        = n_embd_head / 2;
+
+    const float freq_base  = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
+    const float norm_eps   = hparams.f_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    auto & buf_compute = lctx.buf_compute;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    // we rotate only the first n_rot dimensions.
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_rot, n_head, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
+                        ),
+                        K_shift, n_rot, 2, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }
+    for (int il=0; il < n_layer; ++il) {
+        struct ggml_tensor * residual = inpL;
+        offload_func_t offload_func = llama_nop;
+        {
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(cur);
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
+            offload_func(cur);
+            ggml_format_name(cur, "input_layernorm_%d", il);
+        }
+        // self attention
+        {
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+            offload_func_kq(cur);
+
+            // split qkv
+            GGML_ASSERT(n_head_kv == n_head);
+            ggml_set_name(cur, format("qkv_%d", il).c_str());
+            struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
+            offload_func_kq(tmpqkv);
+            struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
+            offload_func_kq(tmpqkv_perm);
+            ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
+            struct ggml_tensor * tmpq = ggml_view_3d(
+                    ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                    0
+                );
+            offload_func_kq(tmpq);
+            struct ggml_tensor * tmpk = ggml_view_3d(
+                    ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                    ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
+                );
+            offload_func_kq(tmpk);
+            // Q/K Layernorm
+            tmpq = ggml_norm(ctx0, tmpq, norm_eps);
+            offload_func_kq(tmpq);
+            tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
+            offload_func_kq(tmpq);
+            tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
+            offload_func_kq(tmpq);
+
+            tmpk = ggml_norm(ctx0, tmpk, norm_eps);
+            offload_func_v(tmpk);
+            tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
+            offload_func_v(tmpk);
+            tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
+            offload_func_v(tmpk);
+
+            // RoPE the first n_rot of q/k, pass the other half, and concat.
+            struct ggml_tensor * qrot = ggml_view_3d(
+                ctx0, tmpq, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpq) * n_embd_head,
+                ggml_element_size(tmpq) * n_embd_head * n_head,
+                0
+            );
+            offload_func_kq(qrot);
+            ggml_format_name(qrot, "qrot_%d", il);
+            struct ggml_tensor * krot = ggml_view_3d(
+                ctx0, tmpk, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpk) * n_embd_head,
+                ggml_element_size(tmpk) * n_embd_head * n_head,
+                0
+            );
+            offload_func_kq(krot);
+            ggml_format_name(krot, "krot_%d", il);
+
+            // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+            struct ggml_tensor * qpass = ggml_view_3d(
+                ctx0, tmpq, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpq) * n_embd_head,
+                ggml_element_size(tmpq) * n_embd_head * n_head,
+                ggml_element_size(tmpq) * n_rot
+            );
+            offload_func_kq(qpass);
+            ggml_format_name(qpass, "qpass_%d", il);
+            struct ggml_tensor * kpass = ggml_view_3d(
+                ctx0, tmpk, n_rot, n_head, n_tokens,
+                ggml_element_size(tmpk) * n_embd_head,
+                ggml_element_size(tmpk) * n_embd_head * n_head,
+                ggml_element_size(tmpk) * n_rot
+            );
+            offload_func_kq(kpass);
+            ggml_format_name(kpass, "kpass_%d", il);
+
+            struct ggml_tensor * qrotated = ggml_rope_custom(
+                    ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+            );
+            offload_func_kq(qrotated);
+            struct ggml_tensor * krotated = ggml_rope_custom(
+                    ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
+            );
+            offload_func_kq(krotated);
+            // ggml currently only supports concatenation on dim=2
+            // so we need to permute qrot, qpass, concat, then permute back.
+            qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+            offload_func_kq(qrotated);
+            krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+            offload_func_kq(krotated);
+
+            qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+            offload_func_kq(qpass);
+            kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+            offload_func_kq(kpass);
+
+            struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+            offload_func_kq(Qcur);
+            struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+            offload_func_kq(Kcur);
+
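Persimmon's attention, built above, LayerNorms Q and K per head and applies RoPE only to the first n_rot = n_embd_head/2 dimensions; the rotated and pass-through halves are then rejoined with ggml_concat (hence all the permutes, since this ggml version concatenates only along dim 2). In effect:

// sketch: q' = concat(rope(q[0:n_rot]), q[n_rot:n_embd_head]); same for k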
+            struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+            offload_func_kq(Q);
+
+            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+            offload_func_kq(Kcur);
+            {
+                struct ggml_tensor * tmpv = ggml_view_3d(
+                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
+                        ggml_element_size(tmpqkv_perm) * n_embd_head,
+                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
+                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
+                    );
+                offload_func_v(tmpv);
+                // store K, V in cache
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(
+                    ctx0, kv_self.k, n_tokens*n_embd_gqa,
+                    (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
+                );
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+            struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
+                    n_embd_head, n_kv, n_head_kv,
+                    ggml_element_size(kv_self.k)*n_embd_gqa,
+                    ggml_element_size(kv_self.k)*n_embd_head,
+                    ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+
+            offload_func_kq(K);
+            ggml_format_name(K, "K_%d", il);
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_kq(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].bo);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+        {
+            // MLP
+            {
+                // Norm
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                offload_func(cur);
+                cur = ggml_add(ctx0,
+                    ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
+                    model.layers[il].ffn_norm_b
+                );
+                ggml_set_name(cur, "ffn_norm");
+                offload_func(cur);
+            }
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);
+
+            cur = ggml_add(ctx0, cur, model.layers[il].b3);
+            offload_func(cur);
+            ggml_set_name(cur, "result_ffn_up");
+
+            cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
+            ggml_set_name(cur, "result_ffn_act");
+            offload_func(cur);
+            offload_func(cur->src[0]);
+
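"result_ffn_act" above is Persimmon's squared-ReLU activation, built from ggml_relu followed by ggml_sqr, in place of the GELU/SiLU used by the other graphs. Scalar sketch:

// sketch of relu(x)^2
static inline float relu_squared(float x) {
    const float r = x > 0.0f ? x : 0.0f;
    return r*r;
}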
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+            cur = ggml_add(ctx0,
+                cur,
+                model.layers[il].b2);
+            offload_func(cur);
+            ggml_set_name(cur, "outFF");
+        }
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_outFF");
+        inpL = cur;
+    }
+    cur = inpL;
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        offload_func_nr(cur);
+
+        cur = ggml_add(ctx0, cur, model.output_norm_b);
+        // offload_func_nr(cur);
+
+        ggml_set_name(cur, "result_norm");
+    }
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+    return gf;
+}
+
+static struct ggml_cgraph * llm_build_bloom(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, token, norm_eps);
+        inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
+    }
+
+    ggml_set_name(inpL, "inpL");
+
|
+
for (int il = 0; il < n_layer; ++il) {
|
5216
|
+
{
|
5217
|
+
// Norm
|
5218
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5219
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
5220
|
+
}
|
5221
|
+
|
5222
|
+
{
|
5223
|
+
// Self Attention
|
5224
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
5225
|
+
|
5226
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
5227
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
5228
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
5229
|
+
|
5230
|
+
struct ggml_tensor * Qcur = tmpq;
|
5231
|
+
struct ggml_tensor * Kcur = tmpk;
|
5232
|
+
|
5233
|
+
// store key and value to memory
|
5234
|
+
{
|
5235
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
5236
|
+
ggml_set_name(Vcur, "Vcur");
|
5237
|
+
|
5238
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
5239
|
+
ggml_set_name(k, "k");
|
5240
|
+
|
5241
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5242
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
5243
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5244
|
+
|
5245
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5246
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5247
|
+
}
|
5248
|
+
|
5249
|
+
struct ggml_tensor * Q =
|
5250
|
+
ggml_permute(ctx0,
|
5251
|
+
ggml_cpy(ctx0,
|
5252
|
+
Qcur,
|
5253
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
5254
|
+
0, 2, 1, 3);
|
5255
|
+
ggml_set_name(Q, "Q");
|
5256
|
+
|
5257
|
+
struct ggml_tensor * K =
|
5258
|
+
ggml_view_3d(ctx0, kv_self.k,
|
5259
|
+
n_embd_head, n_kv, n_head_kv,
|
5260
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5261
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5262
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5263
|
+
ggml_set_name(K, "K");
|
5264
|
+
|
5265
|
+
// K * Q
|
5266
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5267
|
+
ggml_set_name(KQ, "KQ");
|
5268
|
+
|
5269
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
5270
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
5271
|
+
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
5272
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5273
|
+
|
5274
|
+
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
|
5275
|
+
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
5276
|
+
|
5277
|
+
// KQ_masked = mask_past(KQ_scaled)
|
5278
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
5279
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
5280
|
+
|
5281
|
+
// KQ = soft_max(KQ_masked)
|
5282
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
5283
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5284
|
+
|
5285
|
+
// split cached V into n_head heads
|
5286
|
+
struct ggml_tensor * V =
|
5287
|
+
ggml_view_3d(ctx0, kv_self.v,
|
5288
|
+
n_kv, n_embd_head, n_head_kv,
|
5289
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
5290
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5291
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5292
|
+
ggml_set_name(V, "V");
|
5293
|
+
|
5294
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5295
|
+
ggml_set_name(KQV, "KQV");
|
5296
|
+
|
5297
|
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
5298
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5299
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5300
|
+
|
5301
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
5302
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5303
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5304
|
+
}
|
5305
|
+
|
5306
|
+
// Projection
|
5307
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
5308
|
+
|
5309
|
+
// Add the input
|
5310
|
+
cur = ggml_add(ctx0, cur, inpL);
|
5311
|
+
|
5312
|
+
struct ggml_tensor * inpFF = cur;
|
5313
|
+
|
5314
|
+
// FF
|
5315
|
+
{
|
5316
|
+
// Norm
|
5317
|
+
{
|
5318
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
5319
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
5320
|
+
}
|
5321
|
+
|
5322
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
5323
|
+
|
5324
|
+
// GELU activation
|
5325
|
+
cur = ggml_gelu(ctx0, cur);
|
5326
|
+
|
5327
|
+
// Projection
|
5328
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
5329
|
+
}
|
5330
|
+
|
5331
|
+
inpL = ggml_add(ctx0, cur, inpFF);
|
5332
|
+
}
|
5333
|
+
|
5334
|
+
// Output Norm
|
5335
|
+
{
|
5336
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5337
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
5338
|
+
}
|
5339
|
+
ggml_set_name(cur, "result_norm");
|
5340
|
+
|
5341
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5342
|
+
ggml_set_name(cur, "result_output");
|
5343
|
+
|
5344
|
+
ggml_build_forward_expand(gf, cur);
|
5345
|
+
|
5346
|
+
ggml_free(ctx0);
|
5347
|
+
|
5348
|
+
return gf;
|
5349
|
+
}
|
5350
|
+
|
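Also note that BLOOM's ggml_alibi call above passes kv_head as n_past, where the refact graph earlier in this diff passed 0, so the linear bias is computed relative to the current write position in the cache; the maximum bias is 8 in both cases.

// sketch: bias for key column j becomes slope(h) * (n_past + j) rather than slope(h) * j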
5351
|
+
static struct ggml_cgraph * llm_build_mpt(
|
5352
|
+
llama_context & lctx,
|
5353
|
+
const llama_batch & batch) {
|
5354
|
+
const auto & model = lctx.model;
|
5355
|
+
const auto & hparams = model.hparams;
|
5356
|
+
const auto & cparams = lctx.cparams;
|
5357
|
+
|
5358
|
+
const auto & kv_self = lctx.kv_self;
|
5359
|
+
|
5360
|
+
GGML_ASSERT(!!kv_self.ctx);
|
5361
|
+
|
5362
|
+
const int64_t n_embd = hparams.n_embd;
|
5363
|
+
const int64_t n_layer = hparams.n_layer;
|
5364
|
+
const int64_t n_ctx = cparams.n_ctx;
|
5365
|
+
const int64_t n_head = hparams.n_head;
|
5366
|
+
const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
|
5367
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
5368
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5369
|
+
|
5370
|
+
const float norm_eps = hparams.f_norm_eps;
|
5371
|
+
const float clamp_kqv = hparams.f_clamp_kqv;
|
5372
|
+
const float max_alibi_bias = hparams.f_max_alibi_bias;
|
5373
|
+
|
5374
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
5375
|
+
|
5376
|
+
const int32_t n_tokens = batch.n_tokens;
|
5377
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5378
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5379
|
+
|
5380
|
+
auto & buf_compute = lctx.buf_compute;
|
5381
|
+
|
5382
|
+
struct ggml_init_params params = {
|
5383
|
+
/*.mem_size =*/ buf_compute.size,
|
5384
|
+
/*.mem_buffer =*/ buf_compute.data,
|
5385
|
+
/*.no_alloc =*/ false,
|
5386
|
+
};
|
5387
|
+
|
5388
|
+
params.no_alloc = true;
|
5389
|
+
|
5390
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
5391
|
+
|
5392
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
5393
|
+
|
5394
|
+
struct ggml_tensor * cur;
|
5395
|
+
struct ggml_tensor * inpL;
|
5396
|
+
|
5397
|
+
//int warmup = 0;
|
5398
|
+
if (batch.token) {
|
5399
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5400
|
+
|
5401
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
5402
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5403
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
5404
|
+
//warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
|
5405
|
+
}
|
5406
|
+
|
5407
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
5408
|
+
|
5409
|
+
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
5410
|
+
} else {
|
5411
|
+
#ifdef GGML_USE_MPI
|
5412
|
+
GGML_ASSERT(false && "not implemented");
|
5413
|
+
#endif
|
5414
|
+
|
5415
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
5416
|
+
|
5417
|
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
5418
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5419
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
5420
|
+
}
|
5421
|
+
}
|
5422
|
+
|
5423
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
5424
|
+
(void) i_gpu_start;
|
5425
|
+
|
5426
|
+
// offload functions set the tensor output backend to GPU
|
5427
|
+
// tensors are GPU-accelerated if any input or the output has been offloaded
|
5428
|
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
5429
|
+
offload_func_t offload_func_kq = llama_nop;
|
5430
|
+
offload_func_t offload_func_v = llama_nop;
|
5431
|
+
|
5432
|
+
#ifdef GGML_USE_CUBLAS
|
5433
|
+
if (n_gpu_layers > n_layer) {
|
5434
|
+
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
5435
|
+
}
|
5436
|
+
if (n_gpu_layers > n_layer + 1) {
|
5437
|
+
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
5438
|
+
}
|
5439
|
+
if (n_gpu_layers > n_layer + 2) {
|
5440
|
+
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
5441
|
+
}
|
5442
|
+
#endif // GGML_USE_CUBLAS
|
5443
|
+
|
5444
|
+
// KQ_scale
|
5445
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5446
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
5447
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
5448
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5449
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
5450
|
+
}
|
5451
|
+
|
5452
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5453
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5454
|
+
offload_func_kq(KQ_mask);
|
5455
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
5456
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
5457
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5458
|
+
float * data = (float *) KQ_mask->data;
|
5459
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
5460
|
+
|
5461
|
+
for (int h = 0; h < 1; ++h) {
|
5462
|
+
for (int j = 0; j < n_tokens; ++j) {
|
5463
|
+
const llama_pos pos = batch.pos[j];
|
5464
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
5465
|
+
|
5466
|
+
for (int i = 0; i < n_kv; ++i) {
|
5467
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
5468
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
5469
|
+
}
|
5470
|
+
}
|
5471
|
+
}
|
5472
|
+
}
|
5473
|
+
}
|
5474
|
+
|
     for (int il = 0; il < n_layer; ++il) {
-
-
-
-
+        struct ggml_tensor * attn_norm;
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
         }
+#endif // GGML_USE_CUBLAS

+        // self-attention
+        // TODO: refactor into common function (shared with LLaMA)
         {
-
-
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);

-
-
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+            attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+            offload_func(attn_norm);

-
-
+            if (1) {
+                cur = attn_norm;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            if (clamp_kqv > 0.0f) {
+                cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                offload_func_kq(cur);
+            }
+
+            const size_t wsize = ggml_type_size(cur->type);
+
+            struct ggml_tensor * Qcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                0);
+            offload_func_kq(Qcur);
+
+            struct ggml_tensor * Kcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * n_head);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_kq(Kcur);
+
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");

             {
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
                 ggml_set_name(Vcur, "Vcur");

                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");

                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         ( n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }

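The three ggml_view_3d calls above carve Q, K and V out of one fused wqkv projection without copying: all three views share the row stride wsize * n_embd_head * (n_head + 2 * n_head_kv) and differ only in their starting offset. The same offset arithmetic in isolation (illustrative helper with hypothetical names; the real code keeps ggml tensor views):

    #include <cstddef>

    // Element offsets of Q, K and V inside one row of a fused QKV buffer
    // whose row width is n_embd_head * (n_head + 2 * n_head_kv).
    struct qkv_offsets {
        size_t q, k, v;
    };

    qkv_offsets split_qkv(size_t n_embd_head, size_t n_head, size_t n_head_kv) {
        qkv_offsets off;
        off.q = 0;                                  // Q: first n_head heads
        off.k = n_embd_head * n_head;               // K: next n_head_kv heads
        off.v = n_embd_head * (n_head + n_head_kv); // V: last n_head_kv heads
        return off;
    }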
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
-                        Qcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
-                    0, 2, 1, 3);
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");

             struct ggml_tensor * K =
@@ -3866,85 +5561,105 @@ static struct ggml_cgraph * llm_build_starcoder(
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
             ggml_set_name(K, "K");

-            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");

-
-
-            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");

-            //
-            struct ggml_tensor *
+            // TODO: replace with ggml_add()
+            struct ggml_tensor * KQ_scaled_alibi =
+                ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+            offload_func_kq(KQ_scaled_alibi);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");

-
-
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

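ggml_alibi stands in for positional embeddings here: it biases each scaled score with a per-head linear penalty on key distance, and the TODO notes that the result is then folded in with a plain ggml_add against KQ_mask. For the common case where n_head is a power of two, the ALiBi slope schedule is 2^(-max_bias*(h+1)/n_head); a sketch under that assumption (ggml_alibi additionally interpolates a second series for other head counts):

    #include <cmath>
    #include <vector>

    // ALiBi head slopes, power-of-two head count assumed: the score of
    // query j against key i is biased by -slopes[h] * (j - i).
    std::vector<float> alibi_slopes(int n_head, float max_bias /* e.g. 8.0f */) {
        std::vector<float> slopes(n_head);
        for (int h = 0; h < n_head; ++h) {
            slopes[h] = std::pow(2.0f, -max_bias * float(h + 1) / float(n_head));
        }
        return slopes;
    }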
-            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                         n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
             ggml_set_name(V, "V");

             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");

-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
-        }

-
-
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }

         // Add the input
         cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);

-        struct ggml_tensor *
+        struct ggml_tensor * attn_out = cur;

-        //
+        // feed forward
         {
             // Norm
             {
-                cur = ggml_norm(ctx0,
-
+                cur = ggml_norm(ctx0, attn_out, norm_eps);
+                offload_func(cur);
+
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
             }

-            cur =
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);

-            // GELU activation
             cur = ggml_gelu(ctx0, cur);
-
-
-
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
         }

-
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        // input for next layer
+        inpL = cur;
     }

-
+    cur = inpL;
+
+    // norm
     {
-        cur = ggml_norm(ctx0,
-
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        ggml_set_name(cur, "result_norm");
     }
-    ggml_set_name(cur, "result_norm");

     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");

     ggml_build_forward_expand(gf, cur);
+
     ggml_free(ctx0);

     return gf;
@@ -3974,6 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                result = llm_build_persimmon(lctx, batch);
+            } break;
+        case LLM_ARCH_REFACT:
+            {
+                result = llm_build_refact(lctx, batch);
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                result = llm_build_bloom(lctx, batch);
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                result = llm_build_mpt(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -3985,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 // - lctx: llama context
 // - batch: batch to evaluate
-// - n_threads: number of threads to use
 //
 // return 0 on success
 // return positive int on warning
@@ -4052,10 +5782,6 @@ static int llama_decode_internal(
         batch.seq_id = seq_id.data();
     }

-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -4107,7 +5833,9 @@ static int llama_decode_internal(
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -4140,8 +5868,12 @@ static int llama_decode_internal(
 #endif

     // update the kv ring buffer
-    lctx.kv_self.head += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
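The decode changes above turn the KV cache into a proper ring buffer: the slot search no longer resets head to 0 on every call, and after a decode the head is advanced past the cells just written and wrapped so it always indexes a valid cell. The invariant in isolation (illustrative):

    #include <cstdint>

    // After writing n_tokens cells starting at head, advance and wrap so
    // that 0 <= head < size holds before the next slot search.
    void kv_advance_head(uint32_t & head, uint32_t n_tokens, uint32_t size) {
        head += n_tokens;
        if (head >= size) {
            head = 0; // wrap: the next search starts at the cache start
        }
    }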
@@ -4227,18 +5959,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

-static
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-
-
+    switch (llama_vocab_get_type(vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ASSERT(false);
+            return unicode_to_bytes_bpe(token_data.text);
+        }
+        default:
+            GGML_ASSERT(false);
+    }
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-
-
-
-
+    switch (llama_vocab_get_type(vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            char buf[7];
+            int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+            GGML_ASSERT(0 <= result && result < 7);
+            return vocab.token_to_id.at(buf);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
+        }
+        default:
+            GGML_ASSERT(false);
+    }
 }

 static void llama_escape_whitespace(std::string & text) {
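The two helpers above pin down the byte-token convention per vocabulary type: SPM stores a raw byte as the literal token text "<0xHH>", which is why llama_token_to_byte parses two hex digits at offset 3 and llama_byte_to_token formats with "<0x%02X>"; BPE instead maps bytes through the bytes_to_unicode_bpe table from the new unicode.h. A round trip of the SPM convention (illustration only):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // SPM spells byte 0x41 as the token text "<0x41>".
    std::string byte_to_spm_text(uint8_t ch) {
        char buf[7];
        snprintf(buf, sizeof(buf), "<0x%02X>", ch);
        return buf;
    }

    uint8_t spm_text_to_byte(const std::string & text) {
        return (uint8_t) strtol(text.substr(3, 2).c_str(), NULL, 16);
    }

    int main() {
        assert(spm_text_to_byte(byte_to_spm_text(0x41)) == 0x41);
        return 0;
    }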
@@ -4518,15 +6273,9 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-                        try {
-                            llama_token token_byte = llama_byte_to_token(vocab, *j);
-                            output.push_back(token_byte);
-                        } catch (const std::out_of_range & err) {
-                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
-                        }
-                    } else {
-                        output.push_back((*token_multibyte).second);
+                        throw std::runtime_error("ERROR: byte not found in vocab");
                     }
+                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -4563,23 +6312,143 @@ private:
         work_queue.push(bigram);
     }

-
-
-    std::vector<std::string>
+    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words;
+        std::vector<std::string> bpe_encoded_words;
+
+        std::string token = "";
+        // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+        bool collecting_numeric = false;
+        bool collecting_letter = false;
+        bool collecting_special = false;
+        bool collecting_whitespace_lookahead = false;
+        bool collecting = false;
+
+        std::vector<std::string> text_utf;
+        text_utf.reserve(text.size());
+        bpe_words.reserve(text.size());
+        bpe_encoded_words.reserve(text.size());
+
+        auto cps = codepoints_from_utf8(text);
+        for (size_t i = 0; i < cps.size(); ++i)
+            text_utf.emplace_back(codepoint_to_utf8(cps[i]));
+
+        for (int i = 0; i < (int)text_utf.size(); i++) {
+            const std::string & utf_char = text_utf[i];
+            bool split_condition = false;
+            int bytes_remain = text_utf.size() - i;
+            // forward backward lookups
+            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+
+            // handling contractions
+            if (!split_condition && bytes_remain >= 2) {
+                // 's|'t|'m|'d
+                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next;
+                    bpe_words.emplace_back(token);
+                    token = "";
+                    i++;
+                    continue;
+                }
+            }
+            if (!split_condition && bytes_remain >= 3) {
+                // 're|'ve|'ll
+                if (utf_char == "\'" && (
+                    (utf_char_next == "r" && utf_char_next_next == "e") ||
+                    (utf_char_next == "v" && utf_char_next_next == "e") ||
+                    (utf_char_next == "l" && utf_char_next_next == "l"))
+                    ) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    // current token + next token can be defined
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next + utf_char_next_next;
+                    bpe_words.emplace_back(token); // the contraction
+                    token = "";
+                    i += 2;
+                    continue;
+                }
+            }
+
+            if (!split_condition && !collecting) {
+                if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+                    collecting_letter = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+                    collecting_numeric = true;
+                    collecting = true;
+                }
+                else if (
+                    ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
+                    ) {
+                    collecting_special = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
+                    collecting_whitespace_lookahead = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+            else if (!split_condition && collecting) {
+                if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+                    split_condition = true;
+                }
+                else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
+                    split_condition = true;
+                }
+                else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
+                    split_condition = true;
+                }
+                else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+                    split_condition = true;
+                }
+            }
+
+            if (utf_char_next == "") {
+                split_condition = true; // final
+                token += utf_char;
+            }

-
-
-
+            if (split_condition) {
+                if (token.size()) {
+                    bpe_words.emplace_back(token);
+                }
+                token = utf_char;
+                collecting = false;
+                collecting_letter = false;
+                collecting_numeric = false;
+                collecting_special = false;
+                collecting_whitespace_lookahead = false;
+            }
+            else {
+                token += utf_char;
+            }
+        }

-
-
-
-
-
-
+        for (std::string & word : bpe_words) {
+            std::string encoded_token = "";
+            for (char & c : word) {
+                encoded_token += bytes_to_unicode_bpe(c);
+            }
+            bpe_encoded_words.emplace_back(encoded_token);
         }
-        return words;

+        return bpe_encoded_words;
     }

     const llama_vocab & vocab;
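bpe_gpt2_preprocess hand-rolls the GPT-2 pre-tokenizer regex quoted in its comment: contractions are split off first, then maximal runs of letters, digits, or other symbols are collected, with a single leading space allowed to attach to the next run and a whitespace lookahead keeping trailing spaces separate. Illustrative expected splits under those regex semantics (test data, not output captured from this code):

    #include <string>
    #include <vector>

    // "I've 2 cats!" splits into these pieces before byte-level encoding:
    const std::vector<std::string> expected_pieces = {
        "I", "'ve", " 2", " cats", "!"
    };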
@@ -6022,7 +7891,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }

     llama_model model;
     llm_load_arch(ml, model);
@@ -6050,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     const std::string name = ggml_get_name(meta);

     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    if (name.find("attn_v.weight") != std::string::npos) {
+    if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
         ++n_attention_wv;
     }
     else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6087,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }

     std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors

     const size_t meta_size = gguf_get_meta_size(ctx_out);

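The added fout.exceptions(std::ofstream::failbit) call makes every subsequent failed write throw std::ios_base::failure instead of silently setting a stream flag, so quantization stops immediately rather than producing a truncated file. A minimal illustration of the behavior:

    #include <fstream>
    #include <iostream>

    int main() {
        try {
            std::ofstream out("out.bin", std::ios::binary);
            out.exceptions(std::ofstream::failbit); // fail fast from here on
            out.write("data", 4);                   // throws on a failed write
        } catch (const std::ios_base::failure & e) {
            std::cerr << "write failed: " << e.what() << '\n';
            return 1;
        }
        return 0;
    }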
@@ -6100,10 +7981,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         const std::string name = ggml_get_name(tensor);

-        if (
-            read_data.
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml.load_data_for(tensor);

         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +8621,14 @@ struct llama_context * llama_new_context_with_model(

 #ifdef GGML_USE_METAL
     if (model->n_gpu_layers > 0) {
+        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+
         ctx->ctx_metal = ggml_metal_init(1);
         if (!ctx->ctx_metal) {
             LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
             llama_free(ctx);
             return NULL;
         }
-        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
         //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
         //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
     }
@@ -6872,6 +8756,10 @@ int llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

+float llama_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +8927,6 @@ struct llama_data_file_context : llama_data_context {
  *
 */
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
-    // TODO: does not support multi-sequence states
-    {
-        const auto & kv_self = ctx->kv_self;
-        for (uint32_t i = 0; i < kv_self.head; ++i) {
-            GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
-            GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
-            GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
-        }
-    }
-
     // copy rng
     {
         std::stringstream rng_ss;
@@ -7101,36 +8979,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const auto & hparams = ctx->model.hparams;
         const auto & cparams = ctx->cparams;

-        const
-        const
-        const
+        const auto n_layer = hparams.n_layer;
+        const auto n_embd = hparams.n_embd_gqa();
+        const auto n_ctx = cparams.n_ctx;

-        const size_t
-        const
+        const size_t kv_buf_size = kv_self.buf.size;
+        const uint32_t kv_head = kv_self.head;
+        const uint32_t kv_size = kv_self.size;

-        data_ctx->write(&
-        data_ctx->write(&
+        data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
+        data_ctx->write(&kv_head, sizeof(kv_head));
+        data_ctx->write(&kv_size, sizeof(kv_size));

-        if (
+        if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);

             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};

-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
             kout3d->data = kout3d_data.data();

-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
             std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
             vout3d->data = vout3d_data.data();

             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd,
+                n_embd, kv_head, n_layer,
                 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+                kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +9024,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             data_ctx->write(kout3d_data.data(), kout3d_data.size());
             data_ctx->write(vout3d_data.data(), vout3d_data.size());
         }
+
+        for (uint32_t i = 0; i < kv_size; ++i) {
+            const auto & cell = kv_self.cells[i];
+
+            const llama_pos pos = cell.pos;
+            const size_t seq_id_size = cell.seq_id.size();
+
+            data_ctx->write(&pos, sizeof(pos));
+            data_ctx->write(&seq_id_size, sizeof(seq_id_size));
+
+            for (auto seq_id : cell.seq_id) {
+                data_ctx->write(&seq_id, sizeof(seq_id));
+            }
+        }
     }
 }

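With the single-sequence assertions removed from llama_copy_state_data_internal, the session blob now records enough metadata to restore any cache layout: after the K/V tensor data, the loop above appends one record per cell. The per-cell layout as written (llama_pos and llama_seq_id are int32_t in llama.h; sketch only):

    #include <cstddef>
    #include <cstdint>
    #include <set>

    // Serialized per cell, in order:
    //   llama_pos    pos;              // position held by the cell
    //   size_t       seq_id_size;      // number of sequence ids that follow
    //   llama_seq_id ids[seq_id_size]; // the sequence ids themselves
    // llama_set_state_data reads the records back in the same order.
    struct kv_cell_record {
        int32_t           pos;
        std::set<int32_t> seq_ids;
    };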
@@ -7215,34 +9109,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const int n_embd = hparams.n_embd_gqa();
     const int n_ctx = cparams.n_ctx;

-    size_t
-
+    size_t kv_buf_size;
+    uint32_t kv_head;
+    uint32_t kv_size;

-    memcpy(&
-    memcpy(&
+    memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
+    memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
+    memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);

-    if (
-        GGML_ASSERT(kv_self.buf.size ==
+    if (kv_buf_size) {
+        GGML_ASSERT(kv_self.buf.size == kv_buf_size);

         const size_t elt_size = ggml_element_size(kv_self.k);

         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};

-        ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+        ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
         kin3d->data = (void *) inp;
         inp += ggml_nbytes(kin3d);

-        ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+        ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
         vin3d->data = (void *) inp;
         inp += ggml_nbytes(vin3d);

         ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-            n_embd,
+            n_embd, kv_head, n_layer,
             elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

         ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+            kv_head, n_embd, n_layer,
             elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +9148,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }

-    ctx->kv_self.head =
+    ctx->kv_self.head = kv_head;
     ctx->kv_self.size = kv_size;
+
+    ctx->kv_self.cells.resize(kv_size);
+
+    for (uint32_t i = 0; i < kv_size; ++i) {
+        llama_pos pos;
+        size_t seq_id_size;
+
+        memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
+        memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
+
+        ctx->kv_self.cells[i].pos = pos;
+
+        llama_seq_id seq_id;
+
+        for (size_t j = 0; j < seq_id_size; ++j) {
+            memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
+            ctx->kv_self.cells[i].seq_id.insert(seq_id);
+        }
+    }
 }

 const size_t nread = inp - src;
@@ -7471,6 +9386,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
 llama_token llama_token_nl(const struct llama_context * ctx) {
     return ctx->model.vocab.linefeed_id;
 }
+llama_token llama_token_prefix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_prefix_id;
+}
+
+llama_token llama_token_middle(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_middle_id;
+}
+
+llama_token llama_token_suffix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_suffix_id;
+}
+
+llama_token llama_token_eot(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_eot_id;
+}
+

 int llama_tokenize(
         const struct llama_model * model,
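The four new accessors expose the fill-in-the-middle (FIM) special tokens used by infilling-capable models such as Code Llama. A typical prompt brackets the known prefix and suffix so the model generates the middle; a hypothetical assembly helper using these accessors, assuming prefix-suffix-middle (PSM) ordering:

    #include <vector>
    #include "llama.h"

    // <PRE> prefix-tokens <SUF> suffix-tokens <MID> ... model fills the middle.
    std::vector<llama_token> build_infill_prompt(
            const struct llama_context * ctx,
            const std::vector<llama_token> & prefix,
            const std::vector<llama_token> & suffix) {
        std::vector<llama_token> out;
        out.push_back(llama_token_prefix(ctx));
        out.insert(out.end(), prefix.begin(), prefix.end());
        out.push_back(llama_token_suffix(ctx));
        out.insert(out.end(), suffix.begin(), suffix.end());
        out.push_back(llama_token_middle(ctx));
        return out;
    }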
@@ -7493,35 +9424,70 @@ int llama_tokenize(
     return res.size();
 }

+static std::string llama_decode_text(const std::string & text) {
+    std::string decoded_text;
+    auto unicode_sequences = codepoints_from_utf8(text);
+    for (auto& unicode_sequence : unicode_sequences) {
+        decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
+    }
+
+    return decoded_text;
+}
+
 // does not write null-terminator to buf
 int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_n_vocab(model)) {
-
-
-        if (
+        switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+                if (length < 3) {
+                    return -3;
+                }
+                memcpy(buf, "\xe2\x96\x85", 3);
+                return 3;
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else if (llama_is_byte_token(model->vocab, token)) {
+                if (length < 1) {
+                    return -1;
+                }
+                buf[0] = llama_token_to_byte(model->vocab, token);
+                return 1;
+            } else {
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
             }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            return -1;
+            break;
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                result = llama_decode_text(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else {
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
+            }
-
-
+            break;
+        }
+        default:
+            GGML_ASSERT(false);
         }
     }
     return 0;
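Across both vocabulary branches, llama_token_to_piece keeps its size-probing convention: it returns the number of bytes written, or a negative value whose magnitude is the buffer size it needed. A sketch of the resize-and-retry pattern this implies for callers:

    #include <string>
    #include "llama.h"

    std::string token_to_piece(const struct llama_model * model, llama_token token) {
        std::string piece(8, '\0');
        int n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        if (n < 0) {
            piece.resize(-n); // negative return = required size
            n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        }
        piece.resize(n > 0 ? n : 0);
        return piece;
    }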
@@ -7548,14 +9514,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);

     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: load time = %
-    LLAMA_LOG_INFO("%s: sample time = %
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s: eval time = %
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {