llama_cpp 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +622 -150
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +358 -131
- data/ext/llama_cpp/src/ggml-metal.metal +137 -47
- data/ext/llama_cpp/src/ggml-opencl.cpp +136 -68
- data/ext/llama_cpp/src/ggml.c +812 -365
- data/ext/llama_cpp/src/ggml.h +25 -7
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +2387 -421
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +5 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,8 @@
 #define LLAMA_API_INTERNAL
 #include "llama.h"

+#include "unicode.h"
+
 #include "ggml.h"

 #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
@@ -163,6 +186,9 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
+    LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };

@@ -175,6 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT, "mpt" },
     { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BLOOM, "bloom" },
 };

 enum llm_kv {
@@ -277,6 +306,7 @@ struct LLM_KV {

 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -293,6 +323,8 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
 };

 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -374,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PERSIMMON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
+            { LLM_TENSOR_OUTPUT, "output"},
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+        },
+    },
     {
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
     {
@@ -395,6 +452,38 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -912,6 +1001,7 @@ enum e_model {
     MODEL_1B,
     MODEL_3B,
     MODEL_7B,
+    MODEL_8B,
     MODEL_13B,
     MODEL_15B,
     MODEL_30B,
@@ -942,8 +1032,28 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;

+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
     bool operator!=(const llama_hparams & other) const {
-
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }

     uint32_t n_gqa() const {
@@ -977,6 +1087,10 @@ struct llama_layer {
     struct ggml_tensor * attn_norm_b;
     struct ggml_tensor * attn_norm_2;
     struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;

     // attention
     struct ggml_tensor * wq;
@@ -1018,6 +1132,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;

+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;

@@ -1071,6 +1188,10 @@ struct llama_vocab {
     id special_pad_id = -1;

     id linefeed_id = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id = 32010;

     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left, " ", "\u0120");
@@ -1099,6 +1220,8 @@ struct llama_model {

     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_norm;
+    struct ggml_tensor * tok_norm_b;

     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1228,7 +1351,11 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

+    // TODO: this should be:
+    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    //   change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1271,9 +1398,11 @@ static bool llama_kv_cache_init(

 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
-
-
+    struct llama_kv_cache & cache,
+    const struct llama_batch & batch) {
     const uint32_t n_ctx = cache.size;
     const uint32_t n_tokens = batch.n_tokens;

@@ -1286,8 +1415,8 @@ static bool llama_kv_cache_find_slot(

     while (true) {
         if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
             cache.head = 0;
-            n_tested += n_ctx - cache.head;
             continue;
         }

@@ -1338,29 +1467,46 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }

 static void llama_kv_cache_seq_rm(
-
-
-
-
+    struct llama_kv_cache & cache,
+    llama_seq_id seq_id,
+    llama_pos p0,
+    llama_pos p1) {
+    uint32_t new_head = cache.size;
+
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.erase(seq_id);
             if (cache.cells[i].seq_id.empty()) {
                 cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
-
-
-
-
-
+    struct llama_kv_cache & cache,
+    llama_seq_id seq_id_src,
+    llama_seq_id seq_id_dst,
+    llama_pos p0,
+    llama_pos p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    cache.head = 0;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1369,32 +1515,48 @@ static void llama_kv_cache_seq_cp(
 }

 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_shift(
-
-
-
-
-
+    struct llama_kv_cache & cache,
+    llama_seq_id seq_id,
+    llama_pos p0,
+    llama_pos p1,
+    llama_pos delta) {
+    uint32_t new_head = cache.size;
+
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].pos += delta;
             if (cache.cells[i].pos < 0) {
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) new_head = i;
             } else {
                 cache.has_shift = true;
                 cache.cells[i].delta = delta;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
 }

 //
@@ -1598,7 +1760,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta,
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ctx, true);
         }
@@ -1616,7 +1778,7 @@ struct llama_model_loader {
         return tensor;
     }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

         if (cur == NULL) {
@@ -1795,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
     case MODEL_1B: return "1B";
     case MODEL_3B: return "3B";
     case MODEL_7B: return "7B";
+    case MODEL_8B: return "8B";
     case MODEL_13B: return "13B";
     case MODEL_15B: return "15B";
     case MODEL_30B: return "30B";
@@ -1907,6 +2070,49 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_REFACT:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_1B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            case 4096: model.type = e_model::MODEL_7B; break;
+                        } break;
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                hparams.f_clamp_kqv = 0.0f;
+
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 48: model.type = e_model::MODEL_30B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -1971,6 +2177,7 @@ static void llm_load_vocab(

     for (int i = 0; i < n_merges; i++) {
         const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);

         std::string first;
         std::string second;
@@ -2005,6 +2212,7 @@ static void llm_load_vocab(

     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);

         vocab.token_to_id[word] = i;

@@ -2013,12 +2221,13 @@ static void llm_load_vocab(
         token_data.score = scores ? scores[i] : 0.0f;
         token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
+    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
     }

     // special tokens
@@ -2048,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2141,13 +2352,14 @@ static void llm_load_tensors(
     const auto tn = LLM_TN(model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_REFACT:
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                 // output
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;

                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2182,8 +2394,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);

                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                     auto & layer = model.layers[i];

@@ -2212,8 +2424,8 @@ static void llm_load_tensors(
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;

                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2248,8 +2460,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);

                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                     auto & layer = model.layers[i];

@@ -2282,8 +2494,8 @@ static void llm_load_tensors(

                 // output
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;

                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2320,8 +2532,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);

                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                     auto & layer = model.layers[i];

@@ -2359,8 +2571,8 @@ static void llm_load_tensors(

                 // output
                 {
-
-
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;

                     if (n_gpu_layers > int(n_layer)) {
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2397,8 +2609,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);

                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const
-                    const
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                     auto & layer = model.layers[i];

@@ -2431,103 +2643,313 @@ static void llm_load_tensors(
                     }
                 }
             } break;
-
-
-
-        }
+        case LLM_ARCH_PERSIMMON:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

-
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;

-
-
-
-
-
-
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32

-
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }

-
-
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

-
-
-
-
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }

-
-
-
-
-
-
-
+                const uint32_t n_ff = hparams.n_ff;
+                const int i_gpu_start = n_layer - n_gpu_layers;
+                model.layers.resize(n_layer);
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    auto & layer = model.layers[i];
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+                    layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
+                    layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                // TODO: CPU-only for now

-
-
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
+                model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-
-#endif //
-        }
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32

-
-
-
-
-
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }

-
-
-
-        ggml_cuda_set_tensor_split(tensor_split);
-    }
-#endif
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

-
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }

-
-        progress_callback(1.0f, progress_callback_user_data);
-    }
+                const uint32_t n_ff = hparams.n_ff;

-
+                const int i_gpu_start = n_layer - n_gpu_layers;

-
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
-}
+                model.layers.resize(n_layer);

-
-
-
-    int n_gpu_layers,
-    int main_gpu,
-    const float * tensor_split,
-    bool use_mmap,
-    bool use_mlock,
-    bool vocab_only,
-    llama_progress_callback progress_callback,
-    void *progress_callback_user_data) {
-    try {
-        llama_model_loader ml(fname, use_mmap);
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

-
+                    auto & layer = model.layers[i];

-
-
-        llm_load_vocab (ml, model);
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);

-
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);

-
-
-        }
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);

-
-
-            return true;
-        }
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);

-
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+                            ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
+                            ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
+                    }
+                }
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) +
+                            ggml_nbytes(layer.wqkv) +
+                            ggml_nbytes(layer.wo) +
+                            ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w2) +
+                            ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        default:
+            throw std::runtime_error("unknown architecture");
+        }
+    }
+
+    ml.done_getting_tensors();
+
+    // print memory requirements
+    {
+        // this is the total memory required to run the inference
+        size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        const int max_backend_supported_layers = hparams.n_layer + 3;
+        const int max_offloadable_layers = hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+#else
+        (void) n_gpu_layers;
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+    }
+
+    // populate `tensors_by_name`
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
+        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+    }
+
+    (void) tensor_split;
+#ifdef GGML_USE_CUBLAS
+    {
+        ggml_cuda_set_tensor_split(tensor_split);
+    }
+#endif
+
+    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml.mapping);
+
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+}
+
+static bool llama_model_load(
+    const std::string & fname,
+    llama_model & model,
+    int n_gpu_layers,
+    int main_gpu,
+    const float * tensor_split,
+    bool use_mmap,
+    bool use_mlock,
+    bool vocab_only,
+    llama_progress_callback progress_callback,
+    void *progress_callback_user_data) {
+    try {
+        llama_model_loader ml(fname, use_mmap);
+
+        model.hparams.vocab_only = vocab_only;
+
+        llm_load_arch   (ml, model);
+        llm_load_hparams(ml, model);
+        llm_load_vocab  (ml, model);
+
+        llm_load_print_meta(ml, model);
+
+        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+            throw std::runtime_error("vocab size mismatch");
+        }
+
+        if (vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return true;
+        }
+
+        llm_load_tensors(
            ml, model, n_gpu_layers,
            main_gpu, tensor_split,
            use_mlock, progress_callback, progress_callback_user_data);
@@ -2540,8 +2962,8 @@ static bool llama_model_load(
 }

 static struct ggml_cgraph * llm_build_llama(
-
-
+    llama_context & lctx,
+    const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -2579,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/
+        /*.no_alloc =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -2967,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/
+        /*.no_alloc =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3334,7 +3752,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     return gf;
 }

-static struct ggml_cgraph *
+static struct ggml_cgraph * llm_build_refact(
     llama_context & lctx,
     const llama_batch & batch) {
     const auto & model = lctx.model;
@@ -3353,11 +3771,7 @@ static struct ggml_cgraph * llm_build_falcon(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa = hparams.n_embd_gqa();

-
-    const float freq_base = cparams.rope_freq_base;
-    const float freq_scale = cparams.rope_freq_scale;
-    const float norm_eps = hparams.f_norm_eps;
-
+    const float norm_rms_eps = hparams.f_norm_rms_eps;

     const int n_gpu_layers = model.n_gpu_layers;

@@ -3365,21 +3779,16 @@ static struct ggml_cgraph * llm_build_falcon(
     const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;

-
-
-    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
-    //    kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
+    // printf("n_kv = %d\n", n_kv);

     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/
+        /*.no_alloc =*/ true,
     };

-    params.no_alloc = true;
-
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);
|
@@ -3436,7 +3845,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3436
3845
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3437
3846
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3438
3847
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3439
|
-
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(
|
3848
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
3440
3849
|
}
|
3441
3850
|
|
3442
3851
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
@@ -3462,47 +3871,8 @@ static struct ggml_cgraph * llm_build_falcon(
         }
     }

-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    offload_func_kq(KQ_pos);
-    ggml_set_name(KQ_pos, "KQ_pos");
-    ggml_allocr_alloc(lctx.alloc, KQ_pos);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < n_tokens; ++i) {
-            data[i] = batch.pos[i];
-        }
-    }
-
-    // shift the entire K-cache if needed
-    if (do_rope_shift) {
-        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-        offload_func_kq(K_shift);
-        ggml_set_name(K_shift, "K_shift");
-        ggml_allocr_alloc(lctx.alloc, K_shift);
-        if (!ggml_allocr_is_measure(lctx.alloc)) {
-            int * data = (int *) K_shift->data;
-            for (int i = 0; i < n_ctx; ++i) {
-                data[i] = kv_self.cells[i].delta;
-            }
-        }
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * tmp =
-                ggml_rope_custom_inplace(ctx0,
-                    ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_head_kv, n_ctx,
-                        ggml_element_size(kv_self.k)*n_embd_head,
-                        ggml_element_size(kv_self.k)*n_embd_gqa,
-                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                    K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
-            offload_func_kq(tmp);
-            ggml_build_forward_expand(gf, tmp);
-        }
-    }
-
     for (int il = 0; il < n_layer; ++il) {
-
+        ggml_format_name(inpL, "layer_inp_%d", il);

         offload_func_t offload_func = llama_nop;

@@ -3512,80 +3882,49 @@ static struct ggml_cgraph * llm_build_falcon(
         }
 #endif // GGML_USE_CUBLAS

-
-        // TODO: refactor into common function (shared with LLaMA)
-        {
-            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
-            offload_func(attn_norm);
+        struct ggml_tensor * inpSA = inpL;

-
-
-
-            offload_func(
-
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");

-
-
-
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }

-
-
-
-
-
-
-            cur = attn_norm;
-        }
-
-        // compute QKV
-
-        cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-        offload_func_kq(cur);
-
-        // Note that the strides for Kcur, Vcur are set up so that the
-        // resulting views are misaligned with the tensor's storage
-        // (by applying the K/V offset we shift the tensor's original
-        // view to stick out behind the viewed QKV tensor's allocated
-        // memory, so to say). This is ok because no actual accesses
-        // happen to that out-of-range memory, but it can require some
-        // trickery when trying to accurately dump these views for
-        // debugging.
-
-        const size_t wsize = ggml_type_size(cur->type);
+        // self-attention
+        {
+            // compute Q and K
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");

-
-        // non-contiguous views is added for the rope operator
-        struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
-            ctx0, cur, n_embd_head, n_head, n_tokens,
-            wsize * n_embd_head,
-            wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            0));
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
             offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");

-        struct ggml_tensor *
-
-
-            wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            wsize * n_embd_head * n_head));
-        offload_func_kq(tmpk);
-
-        struct ggml_tensor * tmpv = ggml_view_3d(
-            ctx0, cur, n_embd_head, n_head_kv, n_tokens,
-            wsize * n_embd_head,
-            wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            wsize * n_embd_head * (n_head + n_head_kv));
-        offload_func_v(tmpv);
+            struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");

-
-        struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
             offload_func_kq(Qcur);
-
-        offload_func_kq(Kcur);
+            ggml_set_name(Qcur, "Qcur");

+            // store key and value to memory
             {
-
+                // compute the transposed [n_tokens, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
                 offload_func_v(Vcur);
-                offload_func_v(Vcur->src[0]->src[0]);
                 ggml_set_name(Vcur, "Vcur");

                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
@@ -3596,6 +3935,7 @@ static struct ggml_cgraph * llm_build_falcon(
                     ( n_ctx)*ggml_element_size(kv_self.v),
                     (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
+                ggml_set_name(v, "v");

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3614,22 +3954,31 @@ static struct ggml_cgraph * llm_build_falcon(
             offload_func_kq(K);
             ggml_set_name(K, "K");

+            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");

+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");

-
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
             offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");

+            // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

+            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                     n_kv, n_embd_head, n_head_kv,
@@ -3639,42 +3988,85 @@ static struct ggml_cgraph * llm_build_falcon(
             offload_func_v(V);
             ggml_set_name(V, "V");

+#if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif

+            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");

-
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                model.layers[il].wo,
+                cur);
             offload_func(cur);
             ggml_set_name(cur, "result_wo");
         }

-        struct ggml_tensor *
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");

-        // feed
+        // feed-forward network
         {
-
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");

-
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                model.layers[il].w3,
+                cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                model.layers[il].w1,
+                cur);
             offload_func(cur);
+            ggml_set_name(cur, "result_w1");

-
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
             offload_func(cur);
-            cur
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
             offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                model.layers[il].w2,
+                cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
         }

-        cur = ggml_add(ctx0, cur,
-        offload_func(cur);
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx0, cur, inpFF);
         offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");

         // input for next layer
         inpL = cur;
@@ -3684,15 +4076,17 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3684
4076
|
|
3685
4077
|
// norm
|
3686
4078
|
{
|
3687
|
-
cur =
|
4079
|
+
cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
|
3688
4080
|
offload_func_nr(cur);
|
4081
|
+
ggml_set_name(cur, "rms_norm_2");
|
3689
4082
|
|
3690
|
-
cur =
|
3691
|
-
|
3692
|
-
|
4083
|
+
// cur = cur*norm(broadcasted)
|
4084
|
+
cur = ggml_mul(ctx0, cur, model.output_norm);
|
4085
|
+
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
3693
4086
|
ggml_set_name(cur, "result_norm");
|
3694
4087
|
}
|
3695
4088
|
|
4089
|
+
// lm_head
|
3696
4090
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
3697
4091
|
ggml_set_name(cur, "result_output");
|
3698
4092
|
|
@@ -3703,7 +4097,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3703
4097
|
return gf;
|
3704
4098
|
}
|
3705
4099
|
|
3706
|
-
static struct ggml_cgraph *
|
4100
|
+
static struct ggml_cgraph * llm_build_falcon(
|
3707
4101
|
llama_context & lctx,
|
3708
4102
|
const llama_batch & batch) {
|
3709
4103
|
const auto & model = lctx.model;
|
@@ -3724,29 +4118,34 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3724
4118
|
|
3725
4119
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3726
4120
|
|
3727
|
-
const float
|
4121
|
+
const float freq_base = cparams.rope_freq_base;
|
4122
|
+
const float freq_scale = cparams.rope_freq_scale;
|
4123
|
+
const float norm_eps = hparams.f_norm_eps;
|
4124
|
+
|
4125
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
3728
4126
|
|
3729
4127
|
const int32_t n_tokens = batch.n_tokens;
|
3730
4128
|
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3731
4129
|
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3732
4130
|
|
4131
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
4132
|
+
|
4133
|
+
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
|
4134
|
+
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
|
4135
|
+
|
3733
4136
|
auto & buf_compute = lctx.buf_compute;
|
3734
4137
|
|
3735
4138
|
struct ggml_init_params params = {
|
3736
4139
|
/*.mem_size =*/ buf_compute.size,
|
3737
4140
|
/*.mem_buffer =*/ buf_compute.data,
|
3738
|
-
/*.no_alloc =*/
|
4141
|
+
/*.no_alloc =*/ true,
|
3739
4142
|
};
|
3740
4143
|
|
3741
|
-
params.no_alloc = true;
|
3742
|
-
|
3743
4144
|
struct ggml_context * ctx0 = ggml_init(params);
|
3744
4145
|
|
3745
4146
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
3746
4147
|
|
3747
4148
|
struct ggml_tensor * cur;
|
3748
|
-
struct ggml_tensor * token;
|
3749
|
-
struct ggml_tensor * position;
|
3750
4149
|
struct ggml_tensor * inpL;
|
3751
4150
|
|
3752
4151
|
if (batch.token) {
|
@@ -3758,30 +4157,390 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3758
4157
|
}
|
3759
4158
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3760
4159
|
|
3761
|
-
|
4160
|
+
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
3762
4161
|
} else {
|
3763
4162
|
#ifdef GGML_USE_MPI
|
3764
4163
|
GGML_ASSERT(false && "not implemented");
|
3765
4164
|
#endif
|
3766
4165
|
|
3767
|
-
|
4166
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3768
4167
|
|
3769
|
-
ggml_allocr_alloc(lctx.alloc,
|
4168
|
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
3770
4169
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3771
|
-
memcpy(
|
4170
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
3772
4171
|
}
|
3773
4172
|
}
|
3774
4173
|
|
3775
|
-
|
3776
|
-
|
3777
|
-
|
3778
|
-
|
4174
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
4175
|
+
(void) i_gpu_start;
|
4176
|
+
|
4177
|
+
// offload functions set the tensor output backend to GPU
|
4178
|
+
// tensors are GPU-accelerated if any input or the output has been offloaded
|
4179
|
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
4180
|
+
offload_func_t offload_func_kq = llama_nop;
|
4181
|
+
offload_func_t offload_func_v = llama_nop;
|
4182
|
+
|
4183
|
+
#ifdef GGML_USE_CUBLAS
|
4184
|
+
if (n_gpu_layers > n_layer) {
|
4185
|
+
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
4186
|
+
}
|
4187
|
+
if (n_gpu_layers > n_layer + 1) {
|
4188
|
+
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
4189
|
+
}
|
4190
|
+
if (n_gpu_layers > n_layer + 2) {
|
4191
|
+
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
4192
|
+
}
|
4193
|
+
#endif // GGML_USE_CUBLAS
|
4194
|
+
|
4195
|
+
// KQ_scale
|
4196
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4197
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
4198
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
4199
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4200
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
4201
|
+
}
|
4202
|
+
|
4203
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4204
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4205
|
+
offload_func_kq(KQ_mask);
|
4206
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
4207
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
4208
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4209
|
+
float * data = (float *) KQ_mask->data;
|
4210
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
4211
|
+
|
4212
|
+
for (int h = 0; h < 1; ++h) {
|
4213
|
+
for (int j = 0; j < n_tokens; ++j) {
|
4214
|
+
const llama_pos pos = batch.pos[j];
|
4215
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
4216
|
+
|
4217
|
+
for (int i = 0; i < n_kv; ++i) {
|
4218
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
4219
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
4220
|
+
}
|
4221
|
+
}
|
4222
|
+
}
|
4223
|
+
}
|
4224
|
+
}
|
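Note: the KQ_mask fill loop above writes 0 for every (query token j, cache cell i) pair the token is allowed to attend to and -INFINITY otherwise; a cell is blocked when it belongs to a different sequence or holds a position later than the query token. A hedged standalone sketch of the same rule (the cell struct is a simplified stand-in, not the llama.cpp cache type):

```cpp
#include <cmath>
#include <vector>

struct kv_cell { int pos; int seq_id; };   // simplified stand-in for a KV cache cell

// Build an [n_tokens x n_kv] additive mask: 0 = attend, -INFINITY = blocked.
std::vector<float> build_kq_mask(const std::vector<kv_cell> & cells,
                                 const std::vector<int> & tok_pos,
                                 const std::vector<int> & tok_seq) {
    const size_t n_kv     = cells.size();
    const size_t n_tokens = tok_pos.size();
    std::vector<float> mask(n_tokens * n_kv, 0.0f);
    for (size_t j = 0; j < n_tokens; ++j) {
        for (size_t i = 0; i < n_kv; ++i) {
            const bool other_seq = cells[i].seq_id != tok_seq[j];
            const bool future    = cells[i].pos    >  tok_pos[j];
            if (other_seq || future) {
                mask[j*n_kv + i] = -INFINITY;
            }
        }
    }
    return mask;
}
```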
4225
|
+
|
4226
|
+
// KQ_pos - contains the positions
|
4227
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4228
|
+
offload_func_kq(KQ_pos);
|
4229
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
4230
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
4231
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4232
|
+
int * data = (int *) KQ_pos->data;
|
4233
|
+
for (int i = 0; i < n_tokens; ++i) {
|
4234
|
+
data[i] = batch.pos[i];
|
4235
|
+
}
|
4236
|
+
}
|
4237
|
+
|
4238
|
+
// shift the entire K-cache if needed
|
4239
|
+
if (do_rope_shift) {
|
4240
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
4241
|
+
offload_func_kq(K_shift);
|
4242
|
+
ggml_set_name(K_shift, "K_shift");
|
4243
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3779
4244
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3780
|
-
|
3781
|
-
|
4245
|
+
int * data = (int *) K_shift->data;
|
4246
|
+
for (int i = 0; i < n_ctx; ++i) {
|
4247
|
+
data[i] = kv_self.cells[i].delta;
|
3782
4248
|
}
|
3783
4249
|
}
|
3784
|
-
|
4250
|
+
|
4251
|
+
for (int il = 0; il < n_layer; ++il) {
|
4252
|
+
struct ggml_tensor * tmp =
|
4253
|
+
ggml_rope_custom_inplace(ctx0,
|
4254
|
+
ggml_view_3d(ctx0, kv_self.k,
|
4255
|
+
n_embd_head, n_head_kv, n_ctx,
|
4256
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
4257
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4258
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
4259
|
+
K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
|
4260
|
+
offload_func_kq(tmp);
|
4261
|
+
ggml_build_forward_expand(gf, tmp);
|
4262
|
+
}
|
4263
|
+
}
|
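Note: the do_rope_shift block above re-rotates the cached K vectors in place after cells have been moved in the cache. K_shift carries the per-cell position delta, and because RoPE rotations compose (rotating by p and then by delta equals rotating by p + delta), applying RoPE with the delta to the already-rotated cache is equivalent to recomputing it at the new positions. A minimal sketch of rotating one head vector by a position delta, assuming freq_scale = 1 and the neox-style pairing that mode 2 uses (illustrative only, not the ggml kernel):

```cpp
#include <cmath>
#include <vector>

// Rotate a single head vector (even length n) by `delta` positions.
// neox-style pairing: dimension i is rotated together with dimension i + n/2.
void rope_shift_head(std::vector<float> & x, int delta, float freq_base = 10000.0f) {
    const int n = (int) x.size();
    for (int i = 0; i < n/2; ++i) {
        const float theta = delta * std::pow(freq_base, -2.0f * i / n);
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + n/2];
        x[i]       = x0*c - x1*s;
        x[i + n/2] = x0*s + x1*c;
    }
}
```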
4264
|
+
|
4265
|
+
for (int il = 0; il < n_layer; ++il) {
|
4266
|
+
struct ggml_tensor * attn_norm;
|
4267
|
+
|
4268
|
+
offload_func_t offload_func = llama_nop;
|
4269
|
+
|
4270
|
+
#ifdef GGML_USE_CUBLAS
|
4271
|
+
if (il >= i_gpu_start) {
|
4272
|
+
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
4273
|
+
}
|
4274
|
+
#endif // GGML_USE_CUBLAS
|
4275
|
+
|
4276
|
+
// self-attention
|
4277
|
+
// TODO: refactor into common function (shared with LLaMA)
|
4278
|
+
{
|
4279
|
+
attn_norm = ggml_norm(ctx0, inpL, norm_eps);
|
4280
|
+
offload_func(attn_norm);
|
4281
|
+
|
4282
|
+
attn_norm = ggml_add(ctx0,
|
4283
|
+
ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
|
4284
|
+
model.layers[il].attn_norm_b);
|
4285
|
+
offload_func(attn_norm->src[0]);
|
4286
|
+
offload_func(attn_norm);
|
4287
|
+
|
4288
|
+
if (model.layers[il].attn_norm_2) { // Falcon-40B
|
4289
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4290
|
+
offload_func(cur);
|
4291
|
+
|
4292
|
+
cur = ggml_add(ctx0,
|
4293
|
+
ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
|
4294
|
+
model.layers[il].attn_norm_2_b);
|
4295
|
+
offload_func(cur->src[0]);
|
4296
|
+
offload_func(cur);
|
4297
|
+
} else { // Falcon 7B
|
4298
|
+
cur = attn_norm;
|
4299
|
+
}
|
4300
|
+
|
4301
|
+
// compute QKV
|
4302
|
+
|
4303
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
4304
|
+
offload_func_kq(cur);
|
4305
|
+
|
4306
|
+
// Note that the strides for Kcur, Vcur are set up so that the
|
4307
|
+
// resulting views are misaligned with the tensor's storage
|
4308
|
+
// (by applying the K/V offset we shift the tensor's original
|
4309
|
+
// view to stick out behind the viewed QKV tensor's allocated
|
4310
|
+
// memory, so to say). This is ok because no actual accesses
|
4311
|
+
// happen to that out-of-range memory, but it can require some
|
4312
|
+
// trickery when trying to accurately dump these views for
|
4313
|
+
// debugging.
|
4314
|
+
|
4315
|
+
const size_t wsize = ggml_type_size(cur->type);
|
4316
|
+
|
4317
|
+
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
4318
|
+
// non-contiguous views is added for the rope operator
|
4319
|
+
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
4320
|
+
ctx0, cur, n_embd_head, n_head, n_tokens,
|
4321
|
+
wsize * n_embd_head,
|
4322
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
4323
|
+
0));
|
4324
|
+
offload_func_kq(tmpq);
|
4325
|
+
|
4326
|
+
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
4327
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
4328
|
+
wsize * n_embd_head,
|
4329
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
4330
|
+
wsize * n_embd_head * n_head));
|
4331
|
+
offload_func_kq(tmpk);
|
4332
|
+
|
4333
|
+
struct ggml_tensor * tmpv = ggml_view_3d(
|
4334
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
4335
|
+
wsize * n_embd_head,
|
4336
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
4337
|
+
wsize * n_embd_head * (n_head + n_head_kv));
|
4338
|
+
offload_func_v(tmpv);
|
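Note: tmpq, tmpk and tmpv above are strided views into a single fused QKV projection whose per-token row is n_embd_head*(n_head + 2*n_head_kv) elements wide: Q occupies the first n_head head-slots, K the next n_head_kv, and V the last n_head_kv. The sketch below mirrors the byte offsets used by those ggml_view_3d calls (an illustrative helper, not part of the ggml API):

```cpp
#include <cstddef>

// Byte offsets of the Q/K/V sub-blocks inside one fused QKV row.
struct qkv_offsets { size_t q, k, v, row_stride; };

qkv_offsets fused_qkv_offsets(size_t elem_size, int n_embd_head, int n_head, int n_head_kv) {
    qkv_offsets o;
    o.q          = 0;                                                   // Q heads come first
    o.k          = elem_size * n_embd_head * n_head;                    // K heads follow the Q heads
    o.v          = elem_size * n_embd_head * (n_head + n_head_kv);      // V heads follow Q and K
    o.row_stride = elem_size * n_embd_head * (n_head + 2 * n_head_kv);  // one token's full QKV row
    return o;
}
```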
4339
|
+
|
4340
|
+
// using mode = 2 for neox mode
|
4341
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
4342
|
+
offload_func_kq(Qcur);
|
4343
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
4344
|
+
offload_func_kq(Kcur);
|
4345
|
+
|
4346
|
+
{
|
4347
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
4348
|
+
offload_func_v(Vcur);
|
4349
|
+
offload_func_v(Vcur->src[0]->src[0]);
|
4350
|
+
ggml_set_name(Vcur, "Vcur");
|
4351
|
+
|
4352
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
4353
|
+
offload_func_kq(k);
|
4354
|
+
ggml_set_name(k, "k");
|
4355
|
+
|
4356
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
4357
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
4358
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
4359
|
+
offload_func_v(v);
|
4360
|
+
|
4361
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
4362
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
4363
|
+
}
|
4364
|
+
|
4365
|
+
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
4366
|
+
offload_func_kq(Q);
|
4367
|
+
ggml_set_name(Q, "Q");
|
4368
|
+
|
4369
|
+
struct ggml_tensor * K =
|
4370
|
+
ggml_view_3d(ctx0, kv_self.k,
|
4371
|
+
n_embd_head, n_kv, n_head_kv,
|
4372
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4373
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
4374
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
4375
|
+
offload_func_kq(K);
|
4376
|
+
ggml_set_name(K, "K");
|
4377
|
+
|
4378
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
4379
|
+
offload_func_kq(KQ);
|
4380
|
+
ggml_set_name(KQ, "KQ");
|
4381
|
+
|
4382
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
4383
|
+
offload_func_kq(KQ_scaled);
|
4384
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
4385
|
+
|
4386
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
4387
|
+
offload_func_kq(KQ_masked);
|
4388
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
4389
|
+
|
4390
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
4391
|
+
offload_func_v(KQ_soft_max);
|
4392
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
4393
|
+
|
4394
|
+
struct ggml_tensor * V =
|
4395
|
+
ggml_view_3d(ctx0, kv_self.v,
|
4396
|
+
n_kv, n_embd_head, n_head_kv,
|
4397
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
4398
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
4399
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
4400
|
+
offload_func_v(V);
|
4401
|
+
ggml_set_name(V, "V");
|
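Note: the K and V views above reflect how the cache is laid out per layer: K is stored position-major (one row of n_embd_gqa elements per cached position, heads packed inside the row), while V is stored transposed (each dimension is a contiguous run of n_ctx positions), which keeps the later V × softmax matmul contiguous along positions. A sketch of the element indexing implied by those strides (illustrative only, derived from the view offsets above):

```cpp
#include <cstdint>

// Flat element index of K[layer][pos][head][dim].
int64_t k_cache_index(int64_t il, int64_t pos, int64_t head, int64_t dim,
                      int64_t n_ctx, int64_t n_embd_head, int64_t n_embd_gqa) {
    return il*n_ctx*n_embd_gqa      // per-layer block
         + pos*n_embd_gqa           // one row per cached position
         + head*n_embd_head + dim;  // heads packed within the row
}

// Flat element index of V[layer][head][dim][pos] (V is stored transposed).
int64_t v_cache_index(int64_t il, int64_t pos, int64_t head, int64_t dim,
                      int64_t n_ctx, int64_t n_embd_head, int64_t n_embd_gqa) {
    return il*n_ctx*n_embd_gqa             // per-layer block
         + (head*n_embd_head + dim)*n_ctx  // each dimension is a run of n_ctx positions
         + pos;
}
```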
4402
|
+
|
4403
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
4404
|
+
offload_func_v(KQV);
|
4405
|
+
ggml_set_name(KQV, "KQV");
|
4406
|
+
|
4407
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
4408
|
+
offload_func_v(KQV_merged);
|
4409
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
4410
|
+
|
4411
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
4412
|
+
offload_func_v(cur);
|
4413
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
4414
|
+
|
4415
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
4416
|
+
offload_func(cur);
|
4417
|
+
ggml_set_name(cur, "result_wo");
|
4418
|
+
}
|
4419
|
+
|
4420
|
+
struct ggml_tensor * attn_out = cur;
|
4421
|
+
|
4422
|
+
// feed forward
|
4423
|
+
{
|
4424
|
+
struct ggml_tensor * inpFF = attn_norm;
|
4425
|
+
|
4426
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
|
4427
|
+
offload_func(cur);
|
4428
|
+
|
4429
|
+
cur = ggml_gelu(ctx0, cur);
|
4430
|
+
offload_func(cur);
|
4431
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
4432
|
+
offload_func(cur);
|
4433
|
+
}
|
4434
|
+
|
4435
|
+
cur = ggml_add(ctx0, cur, attn_out);
|
4436
|
+
offload_func(cur);
|
4437
|
+
cur = ggml_add(ctx0, cur, inpL);
|
4438
|
+
offload_func(cur);
|
4439
|
+
|
4440
|
+
// input for next layer
|
4441
|
+
inpL = cur;
|
4442
|
+
}
|
4443
|
+
|
4444
|
+
cur = inpL;
|
4445
|
+
|
4446
|
+
// norm
|
4447
|
+
{
|
4448
|
+
cur = ggml_norm(ctx0, cur, norm_eps);
|
4449
|
+
offload_func_nr(cur);
|
4450
|
+
|
4451
|
+
cur = ggml_add(ctx0,
|
4452
|
+
ggml_mul(ctx0, cur, model.output_norm),
|
4453
|
+
model.output_norm_b);
|
4454
|
+
ggml_set_name(cur, "result_norm");
|
4455
|
+
}
|
4456
|
+
|
4457
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
4458
|
+
ggml_set_name(cur, "result_output");
|
4459
|
+
|
4460
|
+
ggml_build_forward_expand(gf, cur);
|
4461
|
+
|
4462
|
+
ggml_free(ctx0);
|
4463
|
+
|
4464
|
+
return gf;
|
4465
|
+
}
|
4466
|
+
|
4467
|
+
static struct ggml_cgraph * llm_build_starcoder(
|
4468
|
+
llama_context & lctx,
|
4469
|
+
const llama_batch & batch) {
|
4470
|
+
const auto & model = lctx.model;
|
4471
|
+
const auto & hparams = model.hparams;
|
4472
|
+
const auto & cparams = lctx.cparams;
|
4473
|
+
|
4474
|
+
const auto & kv_self = lctx.kv_self;
|
4475
|
+
|
4476
|
+
GGML_ASSERT(!!kv_self.ctx);
|
4477
|
+
|
4478
|
+
const int64_t n_embd = hparams.n_embd;
|
4479
|
+
const int64_t n_layer = hparams.n_layer;
|
4480
|
+
const int64_t n_ctx = cparams.n_ctx;
|
4481
|
+
const int64_t n_head = hparams.n_head;
|
4482
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
4483
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
4484
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
4485
|
+
|
4486
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
4487
|
+
|
4488
|
+
const float norm_eps = hparams.f_norm_eps;
|
4489
|
+
|
4490
|
+
const int32_t n_tokens = batch.n_tokens;
|
4491
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
4492
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
4493
|
+
|
4494
|
+
auto & buf_compute = lctx.buf_compute;
|
4495
|
+
|
4496
|
+
struct ggml_init_params params = {
|
4497
|
+
/*.mem_size =*/ buf_compute.size,
|
4498
|
+
/*.mem_buffer =*/ buf_compute.data,
|
4499
|
+
/*.no_alloc =*/ true,
|
4500
|
+
};
|
4501
|
+
|
4502
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
4503
|
+
|
4504
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
4505
|
+
|
4506
|
+
struct ggml_tensor * cur;
|
4507
|
+
struct ggml_tensor * token;
|
4508
|
+
struct ggml_tensor * position;
|
4509
|
+
struct ggml_tensor * inpL;
|
4510
|
+
|
4511
|
+
if (batch.token) {
|
4512
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4513
|
+
|
4514
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
4515
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4516
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
4517
|
+
}
|
4518
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
4519
|
+
|
4520
|
+
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
4521
|
+
} else {
|
4522
|
+
#ifdef GGML_USE_MPI
|
4523
|
+
GGML_ASSERT(false && "not implemented");
|
4524
|
+
#endif
|
4525
|
+
|
4526
|
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
4527
|
+
|
4528
|
+
ggml_allocr_alloc(lctx.alloc, token);
|
4529
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4530
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
4531
|
+
}
|
4532
|
+
}
|
4533
|
+
|
4534
|
+
{
|
4535
|
+
// Compute position embeddings.
|
4536
|
+
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4537
|
+
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
4538
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4539
|
+
for (int i = 0; i < n_tokens; ++i) {
|
4540
|
+
((int32_t *) inp_positions->data)[i] = batch.pos[i];
|
4541
|
+
}
|
4542
|
+
}
|
4543
|
+
ggml_set_name(inp_positions, "inp_positions");
|
3785
4544
|
|
3786
4545
|
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
3787
4546
|
}
|
@@ -3816,48 +4575,984 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3816
4575
|
}
|
3817
4576
|
}
|
3818
4577
|
|
3819
|
-
inpL = ggml_add(ctx0, token, position);
|
3820
|
-
ggml_set_name(inpL, "inpL");
|
3821
|
-
|
4578
|
+
inpL = ggml_add(ctx0, token, position);
|
4579
|
+
ggml_set_name(inpL, "inpL");
|
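Note: unlike the rotary-embedding models above, the StarCoder graph uses learned absolute position embeddings, so inpL is simply the sum of a token-embedding lookup and a position-embedding lookup. A minimal sketch of that step (plain C++ with hypothetical table names; row-major tables with one row of n_embd floats per entry):

```cpp
#include <cstddef>
#include <vector>

// inpL = token_embedding[token] + position_embedding[position], per token.
std::vector<float> embed_tokens(const std::vector<float> & tok_table,
                                const std::vector<float> & pos_table,
                                const std::vector<int>   & tokens,
                                const std::vector<int>   & positions,
                                size_t n_embd) {
    std::vector<float> out(tokens.size() * n_embd);
    for (size_t t = 0; t < tokens.size(); ++t) {
        const float * tok = &tok_table[(size_t) tokens[t]    * n_embd];
        const float * pos = &pos_table[(size_t) positions[t] * n_embd];
        for (size_t d = 0; d < n_embd; ++d) {
            out[t*n_embd + d] = tok[d] + pos[d];
        }
    }
    return out;
}
```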
4580
|
+
|
4581
|
+
for (int il = 0; il < n_layer; ++il) {
|
4582
|
+
{
|
4583
|
+
// Norm
|
4584
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4585
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
4586
|
+
}
|
4587
|
+
|
4588
|
+
{
|
4589
|
+
// Self Attention
|
4590
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
4591
|
+
|
4592
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
4593
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
4594
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
4595
|
+
|
4596
|
+
struct ggml_tensor * Qcur = tmpq;
|
4597
|
+
struct ggml_tensor * Kcur = tmpk;
|
4598
|
+
|
4599
|
+
{
|
4600
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
4601
|
+
ggml_set_name(Vcur, "Vcur");
|
4602
|
+
|
4603
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
4604
|
+
ggml_set_name(k, "k");
|
4605
|
+
|
4606
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
4607
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
4608
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
4609
|
+
|
4610
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
4611
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
4612
|
+
}
|
4613
|
+
|
4614
|
+
struct ggml_tensor * Q =
|
4615
|
+
ggml_permute(ctx0,
|
4616
|
+
ggml_cpy(ctx0,
|
4617
|
+
Qcur,
|
4618
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
4619
|
+
0, 2, 1, 3);
|
4620
|
+
ggml_set_name(Q, "Q");
|
4621
|
+
|
4622
|
+
struct ggml_tensor * K =
|
4623
|
+
ggml_view_3d(ctx0, kv_self.k,
|
4624
|
+
n_embd_head, n_kv, n_head_kv,
|
4625
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4626
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
4627
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
4628
|
+
ggml_set_name(K, "K");
|
4629
|
+
|
4630
|
+
// K * Q
|
4631
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
4632
|
+
ggml_set_name(KQ, "KQ");
|
4633
|
+
|
4634
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
4635
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
4636
|
+
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
4637
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
4638
|
+
|
4639
|
+
// KQ_masked = mask_past(KQ_scaled)
|
4640
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
4641
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
4642
|
+
|
4643
|
+
// KQ = soft_max(KQ_masked)
|
4644
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
4645
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
4646
|
+
|
4647
|
+
// split cached V into n_head heads
|
4648
|
+
struct ggml_tensor * V =
|
4649
|
+
ggml_view_3d(ctx0, kv_self.v,
|
4650
|
+
n_kv, n_embd_head, n_head_kv,
|
4651
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
4652
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
4653
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
4654
|
+
ggml_set_name(V, "V");
|
4655
|
+
|
4656
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
4657
|
+
ggml_set_name(KQV, "KQV");
|
4658
|
+
|
4659
|
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
4660
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
4661
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
4662
|
+
|
4663
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
4664
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
4665
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
4666
|
+
}
|
4667
|
+
|
4668
|
+
// Projection
|
4669
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
4670
|
+
|
4671
|
+
// Add the input
|
4672
|
+
cur = ggml_add(ctx0, cur, inpL);
|
4673
|
+
|
4674
|
+
struct ggml_tensor * inpFF = cur;
|
4675
|
+
|
4676
|
+
// FF
|
4677
|
+
{
|
4678
|
+
// Norm
|
4679
|
+
{
|
4680
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
4681
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
4682
|
+
}
|
4683
|
+
|
4684
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
4685
|
+
|
4686
|
+
// GELU activation
|
4687
|
+
cur = ggml_gelu(ctx0, cur);
|
4688
|
+
|
4689
|
+
// Projection
|
4690
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
4691
|
+
}
|
4692
|
+
|
4693
|
+
inpL = ggml_add(ctx0, cur, inpFF);
|
4694
|
+
}
|
4695
|
+
|
4696
|
+
// Output Norm
|
4697
|
+
{
|
4698
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4699
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
4700
|
+
}
|
4701
|
+
ggml_set_name(cur, "result_norm");
|
4702
|
+
|
4703
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
4704
|
+
ggml_set_name(cur, "result_output");
|
4705
|
+
|
4706
|
+
ggml_build_forward_expand(gf, cur);
|
4707
|
+
ggml_free(ctx0);
|
4708
|
+
|
4709
|
+
return gf;
|
4710
|
+
}
|
4711
|
+
|
4712
|
+
static struct ggml_cgraph * llm_build_persimmon(
|
4713
|
+
llama_context & lctx,
|
4714
|
+
const llama_batch & batch) {
|
4715
|
+
const auto & model = lctx.model;
|
4716
|
+
const auto & hparams = model.hparams;
|
4717
|
+
|
4718
|
+
const auto & kv_self = lctx.kv_self;
|
4719
|
+
|
4720
|
+
GGML_ASSERT(!!kv_self.ctx);
|
4721
|
+
|
4722
|
+
const auto & cparams = lctx.cparams;
|
4723
|
+
const int64_t n_embd = hparams.n_embd;
|
4724
|
+
const int64_t n_layer = hparams.n_layer;
|
4725
|
+
const int64_t n_ctx = cparams.n_ctx;
|
4726
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
4727
|
+
const int64_t n_head = hparams.n_head;
|
4728
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
4729
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
4730
|
+
const size_t n_rot = n_embd_head / 2;
|
4731
|
+
|
4732
|
+
const float freq_base = cparams.rope_freq_base;
|
4733
|
+
const float freq_scale = cparams.rope_freq_scale;
|
4734
|
+
const float norm_eps = hparams.f_norm_eps;
|
4735
|
+
|
4736
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
4737
|
+
|
4738
|
+
|
4739
|
+
const int32_t n_tokens = batch.n_tokens;
|
4740
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
4741
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
4742
|
+
|
4743
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
4744
|
+
|
4745
|
+
auto & buf_compute = lctx.buf_compute;
|
4746
|
+
struct ggml_init_params params = {
|
4747
|
+
/*.mem_size =*/ buf_compute.size,
|
4748
|
+
/*.mem_buffer =*/ buf_compute.data,
|
4749
|
+
/*.no_alloc =*/ true,
|
4750
|
+
};
|
4751
|
+
|
4752
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
4753
|
+
|
4754
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
4755
|
+
|
4756
|
+
struct ggml_tensor * cur;
|
4757
|
+
struct ggml_tensor * inpL;
|
4758
|
+
|
4759
|
+
if (batch.token) {
|
4760
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4761
|
+
|
4762
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
4763
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4764
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
4765
|
+
}
|
4766
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
4767
|
+
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
4768
|
+
} else {
|
4769
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
4770
|
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
4771
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4772
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
4773
|
+
}
|
4774
|
+
}
|
4775
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
4776
|
+
(void) i_gpu_start;
|
4777
|
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
4778
|
+
offload_func_t offload_func_kq = llama_nop;
|
4779
|
+
offload_func_t offload_func_v = llama_nop;
|
4780
|
+
// KQ_scale
|
4781
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4782
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
4783
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4784
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
4785
|
+
}
|
4786
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
4787
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4788
|
+
offload_func_kq(KQ_mask);
|
4789
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
4790
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
4791
|
+
|
4792
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4793
|
+
float * data = (float *) KQ_mask->data;
|
4794
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
4795
|
+
for (int h = 0; h < 1; ++h) {
|
4796
|
+
for (int j = 0; j < n_tokens; ++j) {
|
4797
|
+
const llama_pos pos = batch.pos[j];
|
4798
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
4799
|
+
for (int i = 0; i < n_kv; ++i) {
|
4800
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
4801
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
4802
|
+
}
|
4803
|
+
}
|
4804
|
+
}
|
4805
|
+
}
|
4806
|
+
}
|
4807
|
+
|
4808
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4809
|
+
offload_func_kq(KQ_pos);
|
4810
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
4811
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
4812
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4813
|
+
int * data = (int *) KQ_pos->data;
|
4814
|
+
for (int i = 0; i < n_tokens; ++i) {
|
4815
|
+
data[i] = batch.pos[i];
|
4816
|
+
}
|
4817
|
+
}
|
4818
|
+
if (do_rope_shift) {
|
4819
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
4820
|
+
offload_func_kq(K_shift);
|
4821
|
+
ggml_set_name(K_shift, "K_shift");
|
4822
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
4823
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4824
|
+
int * data = (int *) K_shift->data;
|
4825
|
+
for (int i = 0; i < n_ctx; ++i) {
|
4826
|
+
data[i] = kv_self.cells[i].delta;
|
4827
|
+
}
|
4828
|
+
}
|
4829
|
+
for (int il = 0; il < n_layer; ++il) {
|
4830
|
+
struct ggml_tensor * tmp =
|
4831
|
+
// we rotate only the first n_rot dimensions.
|
4832
|
+
ggml_rope_custom_inplace(ctx0,
|
4833
|
+
ggml_view_3d(ctx0, kv_self.k,
|
4834
|
+
n_rot, n_head, n_ctx,
|
4835
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4836
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
4837
|
+
ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
|
4838
|
+
),
|
4839
|
+
K_shift, n_rot, 2, 0, freq_base, freq_scale);
|
4840
|
+
offload_func_kq(tmp);
|
4841
|
+
ggml_build_forward_expand(gf, tmp);
|
4842
|
+
}
|
4843
|
+
}
|
4844
|
+
for (int il=0; il < n_layer; ++il) {
|
4845
|
+
struct ggml_tensor * residual = inpL;
|
4846
|
+
offload_func_t offload_func = llama_nop;
|
4847
|
+
{
|
4848
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
4849
|
+
offload_func(cur);
|
4850
|
+
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
|
4851
|
+
offload_func(cur);
|
4852
|
+
cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
|
4853
|
+
offload_func(cur);
|
4854
|
+
ggml_format_name(cur, "input_layernorm_%d", il);
|
4855
|
+
}
|
4856
|
+
// self attention
|
4857
|
+
{
|
4858
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
4859
|
+
offload_func_kq(cur);
|
4860
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
4861
|
+
offload_func_kq(cur);
|
4862
|
+
|
4863
|
+
// split qkv
|
4864
|
+
GGML_ASSERT(n_head_kv == n_head);
|
4865
|
+
ggml_set_name(cur, format("qkv_%d", il).c_str());
|
4866
|
+
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
|
4867
|
+
offload_func_kq(tmpqkv);
|
4868
|
+
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
|
4869
|
+
offload_func_kq(tmpqkv_perm);
|
4870
|
+
ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
|
4871
|
+
struct ggml_tensor * tmpq = ggml_view_3d(
|
4872
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
4873
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
4874
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
4875
|
+
0
|
4876
|
+
);
|
4877
|
+
offload_func_kq(tmpq);
|
4878
|
+
struct ggml_tensor * tmpk = ggml_view_3d(
|
4879
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
4880
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
4881
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
4882
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
|
4883
|
+
);
|
4884
|
+
offload_func_kq(tmpk);
|
4885
|
+
// Q/K Layernorm
|
4886
|
+
tmpq = ggml_norm(ctx0, tmpq, norm_eps);
|
4887
|
+
offload_func_kq(tmpq);
|
4888
|
+
tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
|
4889
|
+
offload_func_kq(tmpq);
|
4890
|
+
tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
|
4891
|
+
offload_func_kq(tmpq);
|
4892
|
+
|
4893
|
+
tmpk = ggml_norm(ctx0, tmpk, norm_eps);
|
4894
|
+
offload_func_v(tmpk);
|
4895
|
+
tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
|
4896
|
+
offload_func_v(tmpk);
|
4897
|
+
tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
|
4898
|
+
offload_func_v(tmpk);
|
4899
|
+
|
4900
|
+
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
4901
|
+
struct ggml_tensor * qrot = ggml_view_3d(
|
4902
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
4903
|
+
ggml_element_size(tmpq) * n_embd_head,
|
4904
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
4905
|
+
0
|
4906
|
+
);
|
4907
|
+
offload_func_kq(qrot);
|
4908
|
+
ggml_format_name(qrot, "qrot_%d", il);
|
4909
|
+
struct ggml_tensor * krot = ggml_view_3d(
|
4910
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
4911
|
+
ggml_element_size(tmpk) * n_embd_head,
|
4912
|
+
ggml_element_size(tmpk) * n_embd_head * n_head,
|
4913
|
+
0
|
4914
|
+
);
|
4915
|
+
offload_func_kq(krot);
|
4916
|
+
ggml_format_name(krot, "krot_%d", il);
|
4917
|
+
|
4918
|
+
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
// get the second half of tmpq, i.e. tmpq[n_rot:, :, :]
|
4919
|
+
struct ggml_tensor * qpass = ggml_view_3d(
|
4920
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
4921
|
+
ggml_element_size(tmpq) * n_embd_head,
|
4922
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
4923
|
+
ggml_element_size(tmpq) * n_rot
|
4924
|
+
);
|
4925
|
+
offload_func_kq(qpass);
|
4926
|
+
ggml_format_name(qpass, "qpass_%d", il);
|
4927
|
+
struct ggml_tensor * kpass = ggml_view_3d(
|
4928
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
4929
|
+
ggml_element_size(tmpk) * n_embd_head,
|
4930
|
+
ggml_element_size(tmpk) * n_embd_head * n_head,
|
4931
|
+
ggml_element_size(tmpk) * n_rot
|
4932
|
+
);
|
4933
|
+
offload_func_kq(kpass);
|
4934
|
+
ggml_format_name(kpass, "kpass_%d", il);
|
4935
|
+
|
4936
|
+
struct ggml_tensor * qrotated = ggml_rope_custom(
|
4937
|
+
ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
4938
|
+
);
|
4939
|
+
offload_func_kq(qrotated);
|
4940
|
+
struct ggml_tensor * krotated = ggml_rope_custom(
|
4941
|
+
ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
4942
|
+
);
|
4943
|
+
offload_func_kq(krotated);
|
4944
|
+
// ggml currently only supports concatenation on dim=2
|
4945
|
+
// so we need to permute qrot, qpass, concat, then permute back.
|
4946
|
+
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
4947
|
+
offload_func_kq(qrotated);
|
4948
|
+
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
4949
|
+
offload_func_kq(krotated);
|
4950
|
+
|
4951
|
+
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
4952
|
+
offload_func_kq(qpass);
|
4953
|
+
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
4954
|
+
offload_func_kq(kpass);
|
4955
|
+
|
4956
|
+
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
4957
|
+
offload_func_kq(Qcur);
|
4958
|
+
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
4959
|
+
offload_func_kq(Kcur);
|
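Note: Persimmon applies RoPE to only the first n_rot = n_embd_head/2 dimensions of each query/key head; the remaining half is passed through unchanged and the two parts are concatenated, which is why the code builds the qrot/qpass and krot/kpass views and does the permute, concat, permute-back dance (ggml_concat only supports concatenation on dim 2 at this point). A standalone sketch of the same per-head operation, without the permutes (illustrative only, assuming freq_scale = 1 and neox-style pairing inside the rotated half):

```cpp
#include <cmath>
#include <vector>

// Apply RoPE to the first n_rot dims of a head vector; pass the rest through.
std::vector<float> partial_rope(const std::vector<float> & x, int n_rot,
                                int pos, float freq_base = 10000.0f) {
    std::vector<float> out(x);
    for (int i = 0; i < n_rot/2; ++i) {
        const float theta = pos * std::pow(freq_base, -2.0f * i / n_rot);
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + n_rot/2];   // pair i with i + n_rot/2
        out[i]           = x0*c - x1*s;
        out[i + n_rot/2] = x0*s + x1*c;
    }
    // out[n_rot:] is left untouched (the "qpass"/"kpass" part)
    return out;
}
```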
4960
|
+
|
4961
|
+
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
|
4962
|
+
offload_func_kq(Q);
|
4963
|
+
|
4964
|
+
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
4965
|
+
offload_func_kq(Kcur);
|
4966
|
+
{
|
4967
|
+
struct ggml_tensor * tmpv = ggml_view_3d(
|
4968
|
+
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
4969
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
4970
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
4971
|
+
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
|
4972
|
+
);
|
4973
|
+
offload_func_v(tmpv);
|
4974
|
+
// store K, V in cache
|
4975
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
4976
|
+
offload_func_v(Vcur);
|
4977
|
+
ggml_set_name(Vcur, "Vcur");
|
4978
|
+
|
4979
|
+
struct ggml_tensor * k = ggml_view_1d(
|
4980
|
+
ctx0, kv_self.k, n_tokens*n_embd_gqa,
|
4981
|
+
(ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
|
4982
|
+
);
|
4983
|
+
offload_func_kq(k);
|
4984
|
+
ggml_set_name(k, "k");
|
4985
|
+
|
4986
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
4987
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
4988
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
4989
|
+
offload_func_v(v);
|
4990
|
+
ggml_set_name(v, "v");
|
4991
|
+
|
4992
|
+
// important: storing RoPE-ed version of K in the KV cache!
|
4993
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
4994
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
4995
|
+
}
|
4996
|
+
struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
|
4997
|
+
n_embd_head, n_kv, n_head_kv,
|
4998
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
4999
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5000
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5001
|
+
|
5002
|
+
offload_func_kq(K);
|
5003
|
+
ggml_format_name(K, "K_%d", il);
|
5004
|
+
|
5005
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5006
|
+
offload_func_kq(KQ);
|
5007
|
+
ggml_set_name(KQ, "KQ");
|
5008
|
+
|
5009
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
5010
|
+
offload_func_kq(KQ_scaled);
|
5011
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5012
|
+
|
5013
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
5014
|
+
offload_func_kq(KQ_masked);
|
5015
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
5016
|
+
|
5017
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
5018
|
+
offload_func_kq(KQ_soft_max);
|
5019
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5020
|
+
|
5021
|
+
struct ggml_tensor * V =
|
5022
|
+
ggml_view_3d(ctx0, kv_self.v,
|
5023
|
+
n_kv, n_embd_head, n_head_kv,
|
5024
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
5025
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5026
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5027
|
+
offload_func_v(V);
|
5028
|
+
ggml_set_name(V, "V");
|
5029
|
+
|
5030
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5031
|
+
offload_func_v(KQV);
|
5032
|
+
ggml_set_name(KQV, "KQV");
|
5033
|
+
|
5034
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5035
|
+
offload_func_v(KQV_merged);
|
5036
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5037
|
+
|
5038
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5039
|
+
offload_func_v(cur);
|
5040
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5041
|
+
|
5042
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
5043
|
+
offload_func(cur);
|
5044
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
5045
|
+
offload_func(cur);
|
5046
|
+
ggml_set_name(cur, "result_wo");
|
5047
|
+
}
|
5048
|
+
|
5049
|
+
struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
|
5050
|
+
offload_func(inpFF);
|
5051
|
+
ggml_set_name(inpFF, "inpFF");
|
5052
|
+
{
|
5053
|
+
// MLP
|
5054
|
+
{
|
5055
|
+
// Norm
|
5056
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
5057
|
+
offload_func(cur);
|
5058
|
+
cur = ggml_add(ctx0,
|
5059
|
+
ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
|
5060
|
+
model.layers[il].ffn_norm_b
|
5061
|
+
);
|
5062
|
+
ggml_set_name(cur, "ffn_norm");
|
5063
|
+
offload_func(cur);
|
5064
|
+
}
|
5065
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
|
5066
|
+
offload_func(cur);
|
5067
|
+
|
5068
|
+
cur = ggml_add(ctx0, cur, model.layers[il].b3);
|
5069
|
+
offload_func(cur);
|
5070
|
+
ggml_set_name(cur, "result_ffn_up");
|
5071
|
+
|
5072
|
+
cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
|
5073
|
+
ggml_set_name(cur, "result_ffn_act");
|
5074
|
+
offload_func(cur);
|
5075
|
+
offload_func(cur->src[0]);
|
5076
|
+
|
5077
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
5078
|
+
offload_func(cur);
|
5079
|
+
cur = ggml_add(ctx0,
|
5080
|
+
cur,
|
5081
|
+
model.layers[il].b2);
|
5082
|
+
offload_func(cur);
|
5083
|
+
ggml_set_name(cur, "outFF");
|
5084
|
+
}
|
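Note: the "result_ffn_act" step above is Persimmon's FFN activation: ggml_sqr(ggml_relu(x)), i.e. a ReLU followed by an element-wise square, rather than the SiLU or GELU used by the other architectures in this file. A one-line sketch of the element-wise function:

```cpp
#include <algorithm>

// Squared-ReLU activation used in the Persimmon FFN: f(x) = max(x, 0)^2
inline float relu_squared(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}
```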
5085
|
+
cur = ggml_add(ctx0, cur, inpFF);
|
5086
|
+
offload_func(cur);
|
5087
|
+
ggml_set_name(cur, "inpFF_+_outFF");
|
5088
|
+
inpL = cur;
|
5089
|
+
}
|
5090
|
+
cur = inpL;
|
5091
|
+
{
|
5092
|
+
cur = ggml_norm(ctx0, cur, norm_eps);
|
5093
|
+
offload_func_nr(cur);
|
5094
|
+
cur = ggml_mul(ctx0, cur, model.output_norm);
|
5095
|
+
offload_func_nr(cur);
|
5096
|
+
|
5097
|
+
cur = ggml_add(ctx0, cur, model.output_norm_b);
|
5098
|
+
// offload_func_nr(cur);
|
5099
|
+
|
5100
|
+
ggml_set_name(cur, "result_norm");
|
5101
|
+
}
|
5102
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5103
|
+
ggml_set_name(cur, "result_output");
|
5104
|
+
ggml_build_forward_expand(gf, cur);
|
5105
|
+
ggml_free(ctx0);
|
5106
|
+
return gf;
|
5107
|
+
}
|
5108
|
+
|
5109
|
+
static struct ggml_cgraph * llm_build_bloom(
|
5110
|
+
llama_context & lctx,
|
5111
|
+
const llama_batch & batch) {
|
5112
|
+
const auto & model = lctx.model;
|
5113
|
+
const auto & hparams = model.hparams;
|
5114
|
+
const auto & cparams = lctx.cparams;
|
5115
|
+
|
5116
|
+
const auto & kv_self = lctx.kv_self;
|
5117
|
+
|
5118
|
+
GGML_ASSERT(!!kv_self.ctx);
|
5119
|
+
|
5120
|
+
const int64_t n_embd = hparams.n_embd;
|
5121
|
+
const int64_t n_layer = hparams.n_layer;
|
5122
|
+
const int64_t n_ctx = cparams.n_ctx;
|
5123
|
+
const int64_t n_head = hparams.n_head;
|
5124
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
5125
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
5126
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5127
|
+
|
5128
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
5129
|
+
|
5130
|
+
const float norm_eps = hparams.f_norm_eps;
|
5131
|
+
|
5132
|
+
const int32_t n_tokens = batch.n_tokens;
|
5133
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5134
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5135
|
+
|
5136
|
+
auto & buf_compute = lctx.buf_compute;
|
5137
|
+
|
5138
|
+
struct ggml_init_params params = {
|
5139
|
+
/*.mem_size =*/ buf_compute.size,
|
5140
|
+
/*.mem_buffer =*/ buf_compute.data,
|
5141
|
+
/*.no_alloc =*/ false,
|
5142
|
+
};
|
5143
|
+
|
5144
|
+
params.no_alloc = true;
|
5145
|
+
|
5146
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
5147
|
+
|
5148
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
5149
|
+
|
5150
|
+
struct ggml_tensor * cur;
|
5151
|
+
struct ggml_tensor * token;
|
5152
|
+
struct ggml_tensor * inpL;
|
5153
|
+
|
5154
|
+
if (batch.token) {
|
5155
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5156
|
+
|
5157
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
5158
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5159
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
5160
|
+
}
|
5161
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
5162
|
+
|
5163
|
+
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
5164
|
+
} else {
|
5165
|
+
#ifdef GGML_USE_MPI
|
5166
|
+
GGML_ASSERT(false && "not implemented");
|
5167
|
+
#endif
|
5168
|
+
|
5169
|
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
5170
|
+
|
5171
|
+
ggml_allocr_alloc(lctx.alloc, token);
|
5172
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5173
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
5174
|
+
}
|
5175
|
+
}
|
5176
|
+
|
5177
|
+
// KQ_scale
|
5178
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5179
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
5180
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
5181
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5182
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
5183
|
+
}
|
5184
|
+
|
5185
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5186
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5187
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
5188
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
5189
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5190
|
+
float * data = (float *) KQ_mask->data;
|
5191
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
5192
|
+
|
5193
|
+
for (int h = 0; h < 1; ++h) {
|
5194
|
+
for (int j = 0; j < n_tokens; ++j) {
|
5195
|
+
const llama_pos pos = batch.pos[j];
|
5196
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
5197
|
+
|
5198
|
+
for (int i = 0; i < n_kv; ++i) {
|
5199
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
5200
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
5201
|
+
}
|
5202
|
+
}
|
5203
|
+
}
|
5204
|
+
}
|
5205
|
+
}
|
5206
|
+
|
5207
|
+
// norm
|
5208
|
+
{
|
5209
|
+
inpL = ggml_norm(ctx0, token, norm_eps);
|
5210
|
+
inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
|
5211
|
+
}
|
5212
|
+
|
5213
|
+
ggml_set_name(inpL, "inpL");
|
5214
|
+
|
5215
|
+
for (int il = 0; il < n_layer; ++il) {
|
5216
|
+
{
|
5217
|
+
// Norm
|
5218
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5219
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
5220
|
+
}
|
5221
|
+
|
5222
|
+
{
|
5223
|
+
// Self Attention
|
5224
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
5225
|
+
|
5226
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
5227
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
5228
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
5229
|
+
|
5230
|
+
struct ggml_tensor * Qcur = tmpq;
|
5231
|
+
struct ggml_tensor * Kcur = tmpk;
|
5232
|
+
|
5233
|
+
// store key and value to memory
|
5234
|
+
{
|
5235
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
5236
|
+
ggml_set_name(Vcur, "Vcur");
|
5237
|
+
|
5238
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
5239
|
+
ggml_set_name(k, "k");
|
5240
|
+
|
5241
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5242
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
5243
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5244
|
+
|
5245
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5246
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5247
|
+
}
|
5248
|
+
|
5249
|
+
struct ggml_tensor * Q =
|
5250
|
+
ggml_permute(ctx0,
|
5251
|
+
ggml_cpy(ctx0,
|
5252
|
+
Qcur,
|
5253
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
5254
|
+
0, 2, 1, 3);
|
5255
|
+
ggml_set_name(Q, "Q");
|
5256
|
+
|
5257
|
+
struct ggml_tensor * K =
|
5258
|
+
ggml_view_3d(ctx0, kv_self.k,
|
5259
|
+
n_embd_head, n_kv, n_head_kv,
|
5260
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5261
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5262
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5263
|
+
ggml_set_name(K, "K");
|
5264
|
+
|
5265
|
+
// K * Q
|
5266
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5267
|
+
ggml_set_name(KQ, "KQ");
|
5268
|
+
|
5269
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
5270
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
5271
|
+
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
5272
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5273
|
+
|
5274
|
+
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
|
5275
|
+
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
5276
|
+
|
5277
|
+
// KQ_masked = mask_past(KQ_scaled)
|
5278
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
5279
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
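Note: ggml_alibi above adds the ALiBi position bias to the scaled scores before masking: each head h gets a slope m_h, and the score for key position j receives m_h * j, so more distant keys are penalized more (replacing positional embeddings in BLOOM). With max_bias = 8 and a power-of-two head count the slopes form the geometric sequence m_h = 2^(-8*(h+1)/n_head); ggml handles non-power-of-two head counts a little differently, so the sketch below only covers the simple case and is not the ggml kernel:

```cpp
#include <cmath>
#include <vector>

// ALiBi head slopes for a power-of-two number of heads (max_bias = 8).
std::vector<float> alibi_slopes(int n_head, float max_bias = 8.0f) {
    std::vector<float> m(n_head);
    for (int h = 0; h < n_head; ++h) {
        m[h] = std::pow(2.0f, -max_bias * (h + 1) / n_head);
    }
    return m;
}

// Bias added to the attention score of head `h` for key position `j`.
inline float alibi_bias(const std::vector<float> & m, int h, int j) {
    return m[h] * j;
}
```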
5280
|
+
|
5281
|
+
// KQ = soft_max(KQ_masked)
|
5282
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
5283
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5284
|
+
|
5285
|
+
// split cached V into n_head heads
|
5286
|
+
struct ggml_tensor * V =
|
5287
|
+
ggml_view_3d(ctx0, kv_self.v,
|
5288
|
+
n_kv, n_embd_head, n_head_kv,
|
5289
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
5290
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5291
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5292
|
+
ggml_set_name(V, "V");
|
5293
|
+
|
5294
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5295
|
+
ggml_set_name(KQV, "KQV");
|
5296
|
+
|
5297
|
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
5298
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5299
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5300
|
+
|
5301
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
5302
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5303
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5304
|
+
}
|
5305
|
+
|
5306
|
+
// Projection
|
5307
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
5308
|
+
|
5309
|
+
// Add the input
|
5310
|
+
cur = ggml_add(ctx0, cur, inpL);
|
5311
|
+
|
5312
|
+
struct ggml_tensor * inpFF = cur;
|
5313
|
+
|
5314
|
+
// FF
|
5315
|
+
{
|
5316
|
+
// Norm
|
5317
|
+
{
|
5318
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
5319
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
5320
|
+
}
|
5321
|
+
|
5322
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
5323
|
+
|
5324
|
+
// GELU activation
|
5325
|
+
cur = ggml_gelu(ctx0, cur);
|
5326
|
+
|
5327
|
+
// Projection
|
5328
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
5329
|
+
}
|
5330
|
+
|
5331
|
+
inpL = ggml_add(ctx0, cur, inpFF);
|
5332
|
+
}
|
5333
|
+
|
5334
|
+
// Output Norm
|
5335
|
+
{
|
5336
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5337
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
5338
|
+
}
|
5339
|
+
ggml_set_name(cur, "result_norm");
|
5340
|
+
|
5341
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5342
|
+
ggml_set_name(cur, "result_output");
|
5343
|
+
|
5344
|
+
ggml_build_forward_expand(gf, cur);
|
5345
|
+
|
5346
|
+
ggml_free(ctx0);
|
5347
|
+
|
5348
|
+
return gf;
|
5349
|
+
}
|
5350
|
+
|
5351
|
+
static struct ggml_cgraph * llm_build_mpt(
|
5352
|
+
llama_context & lctx,
|
5353
|
+
const llama_batch & batch) {
|
5354
|
+
const auto & model = lctx.model;
|
5355
|
+
const auto & hparams = model.hparams;
|
5356
|
+
const auto & cparams = lctx.cparams;
|
5357
|
+
|
5358
|
+
const auto & kv_self = lctx.kv_self;
|
5359
|
+
|
5360
|
+
GGML_ASSERT(!!kv_self.ctx);
|
5361
|
+
|
5362
|
+
const int64_t n_embd = hparams.n_embd;
|
5363
|
+
const int64_t n_layer = hparams.n_layer;
|
5364
|
+
const int64_t n_ctx = cparams.n_ctx;
|
5365
|
+
const int64_t n_head = hparams.n_head;
|
5366
|
+
const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
|
5367
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
5368
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5369
|
+
|
5370
|
+
const float norm_eps = hparams.f_norm_eps;
|
5371
|
+
const float clamp_kqv = hparams.f_clamp_kqv;
|
5372
|
+
const float max_alibi_bias = hparams.f_max_alibi_bias;
|
5373
|
+
|
5374
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
5375
|
+
|
5376
|
+
const int32_t n_tokens = batch.n_tokens;
|
5377
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5378
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5379
|
+
|
5380
|
+
auto & buf_compute = lctx.buf_compute;
|
5381
|
+
|
5382
|
+
struct ggml_init_params params = {
|
5383
|
+
/*.mem_size =*/ buf_compute.size,
|
5384
|
+
+ /*.mem_buffer =*/ buf_compute.data,
+ /*.no_alloc =*/ false,
+ };
+
+ params.no_alloc = true;
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ //int warmup = 0;
+ if (batch.token) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+ //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
+ }
+
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+ } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+ ggml_allocr_alloc(lctx.alloc, inpL);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+ }
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+ (void) i_gpu_start;
+
+ // offload functions set the tensor output backend to GPU
+ // tensors are GPU-accelerated if any input or the output has been offloaded
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+ offload_func_t offload_func_kq = llama_nop;
+ offload_func_t offload_func_v = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+ }
+ #endif // GGML_USE_CUBLAS
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ }
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ offload_func_kq(KQ_mask);
+ ggml_set_name(KQ_mask, "KQ_mask");
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ float * data = (float *) KQ_mask->data;
+ memset(data, 0, ggml_nbytes(KQ_mask));
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const llama_pos pos = batch.pos[j];
+ const llama_seq_id seq_id = batch.seq_id[j];
+
+ for (int i = 0; i < n_kv; ++i) {
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+ }
+ }
+ }
+ }
+ }
+
 for (int il = 0; il < n_layer; ++il) {
-
-
-
-
+ struct ggml_tensor * attn_norm;
+
+ offload_func_t offload_func = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (il >= i_gpu_start) {
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
 }
+ #endif // GGML_USE_CUBLAS

+ // self-attention
+ // TODO: refactor into common function (shared with LLaMA)
 {
-
-
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+ offload_func(attn_norm);

-
-
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+ attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+ offload_func(attn_norm);

-
-
+ if (1) {
+ cur = attn_norm;
+ }
+
+ // compute QKV
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ offload_func_kq(cur);
+
+ if (clamp_kqv > 0.0f) {
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+ offload_func_kq(cur);
+ }
+
+ const size_t wsize = ggml_type_size(cur->type);
+
+ struct ggml_tensor * Qcur = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head, n_tokens,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ 0);
+ offload_func_kq(Qcur);
+
+ struct ggml_tensor * Kcur = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ wsize * n_embd_head * n_head);
+ offload_func_kq(Kcur);
+
+ struct ggml_tensor * tmpv = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ wsize * n_embd_head * (n_head + n_head_kv));
+ offload_func_kq(Kcur);
+
+ ggml_set_name(Qcur, "Qcur");
+ ggml_set_name(Kcur, "Kcur");

 {
 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+ offload_func_v(Vcur);
+ offload_func_v(Vcur->src[0]->src[0]);
 ggml_set_name(Vcur, "Vcur");

 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+ offload_func_kq(k);
 ggml_set_name(k, "k");

 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
 ( n_ctx)*ggml_element_size(kv_self.v),
 (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+ offload_func_v(v);

 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
 }

- struct ggml_tensor * Q =
-
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
- 0, 2, 1, 3);
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ offload_func_kq(Q);
 ggml_set_name(Q, "Q");

 struct ggml_tensor * K =
@@ -3866,85 +5561,105 @@ static struct ggml_cgraph * llm_build_starcoder(
 ggml_element_size(kv_self.k)*n_embd_gqa,
 ggml_element_size(kv_self.k)*n_embd_head,
 ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+ offload_func_kq(K);
 ggml_set_name(K, "K");

- // K * Q
 struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ offload_func_kq(KQ);
 ggml_set_name(KQ, "KQ");

-
-
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ offload_func_kq(KQ_scaled);
 ggml_set_name(KQ_scaled, "KQ_scaled");

- //
- struct ggml_tensor *
+ // TODO: replace with ggml_add()
+ struct ggml_tensor * KQ_scaled_alibi =
+ ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+ offload_func_kq(KQ_scaled_alibi);
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+ offload_func_kq(KQ_masked);
 ggml_set_name(KQ_masked, "KQ_masked");

-
-
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ offload_func_v(KQ_soft_max);
 ggml_set_name(KQ_soft_max, "KQ_soft_max");

- // split cached V into n_head heads
 struct ggml_tensor * V =
 ggml_view_3d(ctx0, kv_self.v,
 n_kv, n_embd_head, n_head_kv,
 ggml_element_size(kv_self.v)*n_ctx,
 ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
 ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+ offload_func_v(V);
 ggml_set_name(V, "V");

 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ offload_func_v(KQV);
 ggml_set_name(KQV, "KQV");

- // KQV_merged = KQV.permute(0, 2, 1, 3)
 struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ offload_func_v(KQV_merged);
 ggml_set_name(KQV_merged, "KQV_merged");

- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
 cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+ offload_func_v(cur);
 ggml_set_name(cur, "KQV_merged_contiguous");
- }

-
-
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_wo");
+ }

 // Add the input
 cur = ggml_add(ctx0, cur, inpL);
+ offload_func(cur);

- struct ggml_tensor *
+ struct ggml_tensor * attn_out = cur;

- //
+ // feed forward
 {
 // Norm
 {
- cur = ggml_norm(ctx0,
-
+ cur = ggml_norm(ctx0, attn_out, norm_eps);
+ offload_func(cur);
+
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+ offload_func(cur);
 }

- cur =
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+ offload_func(cur);

- // GELU activation
 cur = ggml_gelu(ctx0, cur);
-
-
-
+ offload_func(cur);
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+ offload_func(cur);
 }

-
+ cur = ggml_add(ctx0, cur, attn_out);
+ offload_func(cur);
+ // input for next layer
+ inpL = cur;
 }

-
+ cur = inpL;
+
+ // norm
 {
- cur = ggml_norm(ctx0,
-
+ cur = ggml_norm(ctx0, cur, norm_eps);
+ offload_func_nr(cur);
+
+ cur = ggml_mul(ctx0, cur, model.output_norm);
+ ggml_set_name(cur, "result_norm");
 }
- ggml_set_name(cur, "result_norm");

 cur = ggml_mul_mat(ctx0, model.output, cur);
 ggml_set_name(cur, "result_output");

 ggml_build_forward_expand(gf, cur);
+
 ggml_free(ctx0);

 return gf;
@@ -3974,6 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm_build_starcoder(lctx, batch);
 } break;
+ case LLM_ARCH_PERSIMMON:
+ {
+ result = llm_build_persimmon(lctx, batch);
+ } break;
+ case LLM_ARCH_REFACT:
+ {
+ result = llm_build_refact(lctx, batch);
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ result = llm_build_bloom(lctx, batch);
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ result = llm_build_mpt(lctx, batch);
+ } break;
 default:
 GGML_ASSERT(false);
 }
@@ -3985,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 // - lctx: llama context
 // - batch: batch to evaluate
- // - n_threads: number of threads to use
 //
 // return 0 on success
 // return positive int on warning
@@ -4052,10 +5782,6 @@ static int llama_decode_internal(
 batch.seq_id = seq_id.data();
 }

- // we always start to search for a free slot from the start of the cache
- // TODO: better strategies can be implemented
- kv_self.head = 0;
-
 if (!llama_kv_cache_find_slot(kv_self, batch)) {
 return 1;
 }
@@ -4107,7 +5833,9 @@ static int llama_decode_internal(
 // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
 const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
 model.arch == LLM_ARCH_BAICHUAN ||
- model.arch == LLM_ARCH_FALCON
+ model.arch == LLM_ARCH_FALCON ||
+ model.arch == LLM_ARCH_REFACT ||
+ model.arch == LLM_ARCH_MPT;
 const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
 if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
 n_threads = 1;
@@ -4140,8 +5868,12 @@ static int llama_decode_internal(
 #endif

 // update the kv ring buffer
- lctx.kv_self.head += n_tokens;
 lctx.kv_self.has_shift = false;
+ lctx.kv_self.head += n_tokens;
+ // Ensure kv cache head points to a valid index.
+ if (lctx.kv_self.head >= lctx.kv_self.size) {
+ lctx.kv_self.head = 0;
+ }

 #ifdef GGML_PERF
 // print timing information per ggml operation (for debugging purposes)
@@ -4227,18 +5959,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
 return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

- static
+ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+ return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+ }
+
+ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 GGML_ASSERT(llama_is_byte_token(vocab, id));
 const auto& token_data = vocab.id_to_token.at(id);
-
-
+ switch (llama_vocab_get_type(vocab)) {
+ case LLAMA_VOCAB_TYPE_SPM: {
+ auto buf = token_data.text.substr(3, 2);
+ return strtol(buf.c_str(), NULL, 16);
+ }
+ case LLAMA_VOCAB_TYPE_BPE: {
+ GGML_ASSERT(false);
+ return unicode_to_bytes_bpe(token_data.text);
+ }
+ default:
+ GGML_ASSERT(false);
+ }
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-
-
-
-
+ switch (llama_vocab_get_type(vocab)) {
+ case LLAMA_VOCAB_TYPE_SPM: {
+ char buf[7];
+ int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+ GGML_ASSERT(0 <= result && result < 7);
+ return vocab.token_to_id.at(buf);
+ }
+ case LLAMA_VOCAB_TYPE_BPE: {
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
+ }
+ default:
+ GGML_ASSERT(false);
+ }
 }

 static void llama_escape_whitespace(std::string & text) {
@@ -4518,15 +6273,9 @@ struct llm_tokenizer_bpe {
 std::string byte_str(1, *j);
 auto token_multibyte = vocab.token_to_id.find(byte_str);
 if (token_multibyte == vocab.token_to_id.end()) {
-
- llama_token token_byte = llama_byte_to_token(vocab, *j);
- output.push_back(token_byte);
- } catch (const std::out_of_range & err) {
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
- }
- } else {
- output.push_back((*token_multibyte).second);
+ throw std::runtime_error("ERROR: byte not found in vocab");
 }
+ output.push_back((*token_multibyte).second);
 }
 } else {
 output.push_back((*token).second);
@@ -4563,23 +6312,143 @@ private:
 work_queue.push(bigram);
 }

-
-
- std::vector<std::string>
+ std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+ std::vector<std::string> bpe_words;
+ std::vector<std::string> bpe_encoded_words;
+
+ std::string token = "";
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+ bool collecting_numeric = false;
+ bool collecting_letter = false;
+ bool collecting_special = false;
+ bool collecting_whitespace_lookahead = false;
+ bool collecting = false;
+
+ std::vector<std::string> text_utf;
+ text_utf.reserve(text.size());
+ bpe_words.reserve(text.size());
+ bpe_encoded_words.reserve(text.size());
+
+ auto cps = codepoints_from_utf8(text);
+ for (size_t i = 0; i < cps.size(); ++i)
+ text_utf.emplace_back(codepoint_to_utf8(cps[i]));
+
+ for (int i = 0; i < (int)text_utf.size(); i++) {
+ const std::string & utf_char = text_utf[i];
+ bool split_condition = false;
+ int bytes_remain = text_utf.size() - i;
+ // forward backward lookups
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+
+ // handling contractions
+ if (!split_condition && bytes_remain >= 2) {
+ // 's|'t|'m|'d
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
+ split_condition = true;
+ }
+ if (split_condition) {
+ if (token.size()) {
+ bpe_words.emplace_back(token); // push previous content as token
+ }
+ token = utf_char + utf_char_next;
+ bpe_words.emplace_back(token);
+ token = "";
+ i++;
+ continue;
+ }
+ }
+ if (!split_condition && bytes_remain >= 3) {
+ // 're|'ve|'ll
+ if (utf_char == "\'" && (
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
+ (utf_char_next == "l" && utf_char_next_next == "l"))
+ ) {
+ split_condition = true;
+ }
+ if (split_condition) {
+ // current token + next token can be defined
+ if (token.size()) {
+ bpe_words.emplace_back(token); // push previous content as token
+ }
+ token = utf_char + utf_char_next + utf_char_next_next;
+ bpe_words.emplace_back(token); // the contraction
+ token = "";
+ i += 2;
+ continue;
+ }
+ }
+
+ if (!split_condition && !collecting) {
+ if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+ collecting_letter = true;
+ collecting = true;
+ }
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+ collecting_numeric = true;
+ collecting = true;
+ }
+ else if (
+ ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+ (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
+ ) {
+ collecting_special = true;
+ collecting = true;
+ }
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
+ collecting_whitespace_lookahead = true;
+ collecting = true;
+ }
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
+ split_condition = true;
+ }
+ }
+ else if (!split_condition && collecting) {
+ if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+ split_condition = true;
+ }
+ else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
+ split_condition = true;
+ }
+ else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
+ split_condition = true;
+ }
+ else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+ split_condition = true;
+ }
+ }
+
+ if (utf_char_next == "") {
+ split_condition = true; // final
+ token += utf_char;
+ }

-
-
-
+ if (split_condition) {
+ if (token.size()) {
+ bpe_words.emplace_back(token);
+ }
+ token = utf_char;
+ collecting = false;
+ collecting_letter = false;
+ collecting_numeric = false;
+ collecting_special = false;
+ collecting_whitespace_lookahead = false;
+ }
+ else {
+ token += utf_char;
+ }
+ }

-
-
-
-
-
-
+ for (std::string & word : bpe_words) {
+ std::string encoded_token = "";
+ for (char & c : word) {
+ encoded_token += bytes_to_unicode_bpe(c);
+ }
+ bpe_encoded_words.emplace_back(encoded_token);
 }
- return words;

+ return bpe_encoded_words;
 }

 const llama_vocab & vocab;
@@ -6022,7 +7891,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 nthread = std::thread::hardware_concurrency();
 }

-
+ // mmap consistently increases speed Linux, and also increases speed on Windows with
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+ #if defined(__linux__) || defined(_WIN32)
+ constexpr bool use_mmap = true;
+ #else
+ constexpr bool use_mmap = false;
+ #endif
+
+ llama_model_loader ml(fname_inp, use_mmap);
+ if (ml.use_mmap) {
+ ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+ }

 llama_model model;
 llm_load_arch(ml, model);
@@ -6050,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 const std::string name = ggml_get_name(meta);

 // TODO: avoid hardcoded tensor names - use the TN_* constants
- if (name.find("attn_v.weight") != std::string::npos) {
+ if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
 ++n_attention_wv;
 }
 else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6087,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }

 std::ofstream fout(fname_out, std::ios::binary);
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors

 const size_t meta_size = gguf_get_meta_size(ctx_out);

@@ -6100,10 +7981,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 const std::string name = ggml_get_name(tensor);

- if (
- read_data.
+ if (!ml.use_mmap) {
+ if (read_data.size() < ggml_nbytes(tensor)) {
+ read_data.resize(ggml_nbytes(tensor));
+ }
+ tensor->data = read_data.data();
 }
- tensor->data = read_data.data();
 ml.load_data_for(tensor);

 LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +8621,14 @@ struct llama_context * llama_new_context_with_model(

 #ifdef GGML_USE_METAL
 if (model->n_gpu_layers > 0) {
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+
 ctx->ctx_metal = ggml_metal_init(1);
 if (!ctx->ctx_metal) {
 LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
 llama_free(ctx);
 return NULL;
 }
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
 //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
 //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
 }
@@ -6872,6 +8756,10 @@ int llama_n_embd(const struct llama_model * model) {
 return model->hparams.n_embd;
 }

+ float llama_rope_freq_scale_train(const struct llama_model * model) {
+ return model->hparams.rope_freq_scale_train;
+ }
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
 return snprintf(buf, buf_size, "%s %s %s",
 llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +8927,6 @@ struct llama_data_file_context : llama_data_context {
 *
 */
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
- // TODO: does not support multi-sequence states
- {
- const auto & kv_self = ctx->kv_self;
- for (uint32_t i = 0; i < kv_self.head; ++i) {
- GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
- GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
- GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
- }
- }
-
 // copy rng
 {
 std::stringstream rng_ss;
@@ -7101,36 +8979,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
 const auto & hparams = ctx->model.hparams;
 const auto & cparams = ctx->cparams;

- const
- const
- const
+ const auto n_layer = hparams.n_layer;
+ const auto n_embd = hparams.n_embd_gqa();
+ const auto n_ctx = cparams.n_ctx;

- const size_t
- const
+ const size_t kv_buf_size = kv_self.buf.size;
+ const uint32_t kv_head = kv_self.head;
+ const uint32_t kv_size = kv_self.size;

- data_ctx->write(&
- data_ctx->write(&
+ data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
+ data_ctx->write(&kv_head, sizeof(kv_head));
+ data_ctx->write(&kv_size, sizeof(kv_size));

- if (
+ if (kv_buf_size) {
 const size_t elt_size = ggml_element_size(kv_self.k);

 ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
 ggml_cgraph gf{};

- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
 std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
 kout3d->data = kout3d_data.data();

- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
 std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
 vout3d->data = vout3d_data.data();

 ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
- n_embd,
+ n_embd, kv_head, n_layer,
 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

 ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+ kv_head, n_embd, n_layer,
 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +9024,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
 data_ctx->write(kout3d_data.data(), kout3d_data.size());
 data_ctx->write(vout3d_data.data(), vout3d_data.size());
 }
+
+ for (uint32_t i = 0; i < kv_size; ++i) {
+ const auto & cell = kv_self.cells[i];
+
+ const llama_pos pos = cell.pos;
+ const size_t seq_id_size = cell.seq_id.size();
+
+ data_ctx->write(&pos, sizeof(pos));
+ data_ctx->write(&seq_id_size, sizeof(seq_id_size));
+
+ for (auto seq_id : cell.seq_id) {
+ data_ctx->write(&seq_id, sizeof(seq_id));
+ }
+ }
 }
 }

@@ -7215,34 +9109,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 const int n_embd = hparams.n_embd_gqa();
 const int n_ctx = cparams.n_ctx;

- size_t
-
+ size_t kv_buf_size;
+ uint32_t kv_head;
+ uint32_t kv_size;

- memcpy(&
- memcpy(&
+ memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
+ memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);

- if (
- GGML_ASSERT(kv_self.buf.size ==
+ if (kv_buf_size) {
+ GGML_ASSERT(kv_self.buf.size == kv_buf_size);

 const size_t elt_size = ggml_element_size(kv_self.k);

 ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
 ggml_cgraph gf{};

- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
 kin3d->data = (void *) inp;
 inp += ggml_nbytes(kin3d);

- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
 vin3d->data = (void *) inp;
 inp += ggml_nbytes(vin3d);

 ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
- n_embd,
+ n_embd, kv_head, n_layer,
 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

 ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+ kv_head, n_embd, n_layer,
 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +9148,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 ggml_free(cpy_ctx);
 }

- ctx->kv_self.head =
+ ctx->kv_self.head = kv_head;
 ctx->kv_self.size = kv_size;
+
+ ctx->kv_self.cells.resize(kv_size);
+
+ for (uint32_t i = 0; i < kv_size; ++i) {
+ llama_pos pos;
+ size_t seq_id_size;
+
+ memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
+ memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
+
+ ctx->kv_self.cells[i].pos = pos;
+
+ llama_seq_id seq_id;
+
+ for (size_t j = 0; j < seq_id_size; ++j) {
+ memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
+ ctx->kv_self.cells[i].seq_id.insert(seq_id);
+ }
+ }
 }

 const size_t nread = inp - src;
@@ -7471,6 +9386,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
 llama_token llama_token_nl(const struct llama_context * ctx) {
 return ctx->model.vocab.linefeed_id;
 }
+ llama_token llama_token_prefix(const struct llama_context * ctx) {
+ return ctx->model.vocab.special_prefix_id;
+ }
+
+ llama_token llama_token_middle(const struct llama_context * ctx) {
+ return ctx->model.vocab.special_middle_id;
+ }
+
+ llama_token llama_token_suffix(const struct llama_context * ctx) {
+ return ctx->model.vocab.special_suffix_id;
+ }
+
+ llama_token llama_token_eot(const struct llama_context * ctx) {
+ return ctx->model.vocab.special_eot_id;
+ }
+

 int llama_tokenize(
 const struct llama_model * model,
@@ -7493,35 +9424,70 @@ int llama_tokenize(
 return res.size();
 }

+ static std::string llama_decode_text(const std::string & text) {
+ std::string decoded_text;
+ auto unicode_sequences = codepoints_from_utf8(text);
+ for (auto& unicode_sequence : unicode_sequences) {
+ decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
+ }
+
+ return decoded_text;
+ }
+
 // does not write null-terminator to buf
 int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
 if (0 <= token && token < llama_n_vocab(model)) {
-
-
- if (
+ switch (llama_vocab_get_type(model->vocab)) {
+ case LLAMA_VOCAB_TYPE_SPM: {
+ if (llama_is_normal_token(model->vocab, token)) {
+ std::string result = model->vocab.id_to_token[token].text;
 llama_unescape_whitespace(result);
+ if (length < (int) result.length()) {
+ return -result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+ if (length < 3) {
+ return -3;
+ }
+ memcpy(buf, "\xe2\x96\x85", 3);
+ return 3;
+ } else if (llama_is_control_token(model->vocab, token)) {
+ ;
+ } else if (llama_is_byte_token(model->vocab, token)) {
+ if (length < 1) {
+ return -1;
+ }
+ buf[0] = llama_token_to_byte(model->vocab, token);
+ return 1;
+ } else {
+ // TODO: for now we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ // GGML_ASSERT(false);
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- return -1;
+ break;
+ }
+ case LLAMA_VOCAB_TYPE_BPE: {
+ if (llama_is_normal_token(model->vocab, token)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ result = llama_decode_text(result);
+ if (length < (int) result.length()) {
+ return -result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (llama_is_control_token(model->vocab, token)) {
+ ;
+ } else {
+ // TODO: for now we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ // GGML_ASSERT(false);
 }
-
-
+ break;
+ }
+ default:
+ GGML_ASSERT(false);
 }
 }
 return 0;
@@ -7548,14 +9514,14 @@ void llama_print_timings(struct llama_context * ctx) {
 const llama_timings timings = llama_get_timings(ctx);

 LLAMA_LOG_INFO("\n");
- LLAMA_LOG_INFO("%s: load time = %
- LLAMA_LOG_INFO("%s: sample time = %
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
+ LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
- LLAMA_LOG_INFO("%s: prompt eval time = %
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
- LLAMA_LOG_INFO("%s: eval time = %
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
 __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
- LLAMA_LOG_INFO("%s: total time = %
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {