llama_cpp 0.9.5 → 0.10.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -74,6 +74,7 @@
 #include <set>
 #include <sstream>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>
 
 #if defined(_MSC_VER)
@@ -90,7 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES 8192
+#define LLAMA_MAX_EXPERTS 8
 
 //
 // logging
@@ -192,6 +194,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
+    LLM_ARCH_QWEN,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -208,6 +211,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
 };
 
 enum llm_kv {
@@ -228,6 +232,8 @@ enum llm_kv {
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
+    LLM_KV_EXPERT_COUNT,
+    LLM_KV_EXPERT_USED_COUNT,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -278,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
     { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
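Note: the "%s" prefix in these key names is filled with the architecture name, so a Mixtral-style mixture-of-experts GGUF would carry metadata entries along the lines of the following (the values shown are illustrative, not taken from this diff):

    llama.expert_count      = 8
    llama.expert_used_count = 2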
@@ -335,10 +343,14 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
-
+    LLM_TENSOR_FFN_DOWN_EXP,
+    LLM_TENSOR_FFN_GATE_EXP,
+    LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
 };
@@ -357,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
     { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
     { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
     { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
     { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
     { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
     { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
     { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+    { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+    { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+    { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
     },
     },
     {
@@ -518,6 +534,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
     { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
     },
+    {
+    LLM_ARCH_QWEN,
+    {
+    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+    { LLM_TENSOR_OUTPUT, "output" },
+    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+    },
+    },
 
     {
     LLM_ARCH_UNKNOWN,
@@ -566,27 +598,16 @@ struct LLM_TN {
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
+
+    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+    }
 };
 
 //
 // gguf helpers
 //
 
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
-    } \
-} while (0)
-
 static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE, "none" },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
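Note: the new two-argument-index overload of LLM_TN::operator() plugs both the block index and the expert index into the per-expert format strings added above. A minimal usage sketch (the variable names are illustrative, assuming LLM_TN is constructed from an architecture as elsewhere in llama.cpp):

    // hypothetical usage of the overload added in this release
    const LLM_TN tn(LLM_ARCH_LLAMA);

    // "blk.%d.ffn_gate.%d" with bid = 1 and xid = 3
    std::string name = tn(LLM_TENSOR_FFN_GATE_EXP, "weight", 1, 3);
    // name == "blk.1.ffn_gate.3.weight"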
@@ -620,7 +641,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
     }
 }
 
-static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
 
     switch (type) {
@@ -1155,6 +1176,8 @@ struct llama_hparams {
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_ff;
+    uint32_t n_expert = 0;
+    uint32_t n_expert_used = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1169,15 +1192,18 @@ struct llama_hparams {
     float f_max_alibi_bias;
 
     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only
-        if (this->n_vocab
-        if (this->n_ctx_train
-        if (this->n_embd
-        if (this->n_head
-        if (this->n_head_kv
-        if (this->n_layer
-        if (this->n_rot
-        if (this->n_ff
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
+        if (this->n_expert != other.n_expert) return true;
+        if (this->n_expert_used != other.n_expert_used) return true;
+
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1222,6 +1248,7 @@ struct llama_cparams {
     float yarn_beta_slow;
 
     bool mul_mat_q;
+    bool offload_kqv;
 };
 
 struct llama_layer {
@@ -1243,6 +1270,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
@@ -1255,6 +1285,12 @@ struct llama_layer {
     struct ggml_tensor * ffn_down; // w2
     struct ggml_tensor * ffn_up;   // w3
 
+    // ff MoE
+    struct ggml_tensor * ffn_gate_inp;
+    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b;   // b3
@@ -1287,8 +1323,8 @@ struct llama_kv_cache {
 
     std::vector<llama_kv_cell> cells;
 
-    struct ggml_tensor
-    struct ggml_tensor
+    std::vector<struct ggml_tensor *> k_l; // per layer
+    std::vector<struct ggml_tensor *> v_l;
 
     struct ggml_context * ctx = NULL;
 
@@ -1301,8 +1337,10 @@ struct llama_kv_cache {
 
 #ifdef GGML_USE_CUBLAS
         if (ggml_cublas_loaded()) {
-
-
+            for (size_t i = 0; i < k_l.size(); ++i) {
+                ggml_cuda_free_data(k_l[i]);
+                ggml_cuda_free_data(v_l[i]);
+            }
         }
 #endif
     }
@@ -1492,9 +1530,11 @@ struct llama_context {
 static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
-                         ggml_type
+                         ggml_type ktype,
+                         ggml_type vtype,
                           uint32_t n_ctx,
-                               int n_gpu_layers
+                               int n_gpu_layers,
+                              bool offload) {
     const uint32_t n_embd  = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;
 
@@ -1510,7 +1550,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-    cache.buf.resize(
+    cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
@@ -1520,37 +1560,44 @@ static bool llama_kv_cache_init(
 
     cache.ctx = ggml_init(params);
 
+    size_t vram_kv_cache = 0;
+
     if (!cache.ctx) {
         LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
-    cache.
-    cache.
-    ggml_set_name(cache.k, "cache_k");
-    ggml_set_name(cache.v, "cache_v");
+    cache.k_l.reserve(n_layer);
+    cache.v_l.reserve(n_layer);
 
-    (
+    const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
 
-
-    if (ggml_cublas_loaded()) {
-        size_t vram_kv_cache = 0;
+    GGML_UNUSED(offload);
 
-
-
-
-
-
-
-
-
-
-
-
-
+    for (int i = 0; i < (int) n_layer; i++) {
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        cache.k_l.push_back(k);
+        cache.v_l.push_back(v);
+#ifdef GGML_USE_CUBLAS
+        if (i >= i_gpu_start) {
+            if (offload) {
+                ggml_cuda_assign_buffers_no_scratch(k);
+                vram_kv_cache += ggml_nbytes(k);
+                ggml_cuda_assign_buffers_no_scratch(v);
+                vram_kv_cache += ggml_nbytes(v);
+            }
         }
+#endif // GGML_USE_CUBLAS
     }
-
+
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+    }
+
+    GGML_UNUSED(n_gpu_layers);
 
     return true;
 }
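With per-layer K/V tensors the cache footprint is just the sum of the per-layer row sizes. A back-of-the-envelope sketch for a 7B-class model (n_layer = 32, n_embd_gqa = 4096, n_ctx = 4096, both caches in F16 — these numbers are illustrative, not taken from the diff):

    // per layer: one K and one V tensor of n_embd_gqa * n_ctx elements each
    const size_t n_elements = 4096ull * 4096ull;                             // n_embd_gqa * n_ctx
    const size_t per_layer  = ggml_row_size(GGML_TYPE_F16, n_elements) * 2;  // K + V, 32 MiB each
    const size_t total      = per_layer * 32;                                // ~2 GiB for the whole cache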
@@ -1771,6 +1818,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
|
|
1771
1818
|
return buf;
|
1772
1819
|
}
|
1773
1820
|
|
1821
|
+
namespace GGUFMeta {
|
1822
|
+
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
|
1823
|
+
struct GKV_Base_Type {
|
1824
|
+
static constexpr gguf_type gt = gt_;
|
1825
|
+
|
1826
|
+
static T getter(const gguf_context * ctx, const int kid) {
|
1827
|
+
return gfun(ctx, kid);
|
1828
|
+
}
|
1829
|
+
};
|
1830
|
+
|
1831
|
+
template<typename T> struct GKV_Base;
|
1832
|
+
|
1833
|
+
template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
|
1834
|
+
template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
|
1835
|
+
template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
|
1836
|
+
template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
|
1837
|
+
template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
|
1838
|
+
template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
|
1839
|
+
template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
|
1840
|
+
template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
|
1841
|
+
template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
|
1842
|
+
template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
|
1843
|
+
template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
|
1844
|
+
template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
|
1845
|
+
|
1846
|
+
template<> struct GKV_Base<std::string> {
|
1847
|
+
static constexpr gguf_type gt = GGUF_TYPE_STRING;
|
1848
|
+
|
1849
|
+
static std::string getter(const gguf_context * ctx, const int kid) {
|
1850
|
+
return gguf_get_val_str(ctx, kid);
|
1851
|
+
}
|
1852
|
+
};
|
1853
|
+
|
1854
|
+
struct ArrayInfo{
|
1855
|
+
const gguf_type gt;
|
1856
|
+
const size_t length;
|
1857
|
+
const void * data;
|
1858
|
+
};
|
1859
|
+
|
1860
|
+
template<> struct GKV_Base<ArrayInfo> {
|
1861
|
+
public:
|
1862
|
+
static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
|
1863
|
+
static ArrayInfo getter(const gguf_context *ctx, const int k) {
|
1864
|
+
return ArrayInfo {
|
1865
|
+
gguf_get_arr_type(ctx, k),
|
1866
|
+
size_t(gguf_get_arr_n(ctx, k)),
|
1867
|
+
gguf_get_arr_data(ctx, k),
|
1868
|
+
};
|
1869
|
+
}
|
1870
|
+
};
|
1871
|
+
|
1872
|
+
template<typename T>
|
1873
|
+
class GKV: public GKV_Base<T> {
|
1874
|
+
GKV() = delete;
|
1875
|
+
|
1876
|
+
public:
|
1877
|
+
static T get_kv(const gguf_context * ctx, const int k) {
|
1878
|
+
const enum gguf_type kt = gguf_get_kv_type(ctx, k);
|
1879
|
+
|
1880
|
+
if (kt != GKV::gt) {
|
1881
|
+
throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
|
1882
|
+
gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
|
1883
|
+
}
|
1884
|
+
return GKV::getter(ctx, k);
|
1885
|
+
}
|
1886
|
+
|
1887
|
+
static const char * override_type_to_str(const llama_model_kv_override_type ty) {
|
1888
|
+
switch (ty) {
|
1889
|
+
case LLAMA_KV_OVERRIDE_BOOL: return "bool";
|
1890
|
+
case LLAMA_KV_OVERRIDE_INT: return "int";
|
1891
|
+
case LLAMA_KV_OVERRIDE_FLOAT: return "float";
|
1892
|
+
}
|
1893
|
+
return "unknown";
|
1894
|
+
}
|
1895
|
+
|
1896
|
+
static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
|
1897
|
+
if (!override) { return false; }
|
1898
|
+
if (override->tag == expected_type) {
|
1899
|
+
LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
|
1900
|
+
__func__, override_type_to_str(override->tag), override->key);
|
1901
|
+
switch (override->tag) {
|
1902
|
+
case LLAMA_KV_OVERRIDE_BOOL: {
|
1903
|
+
printf("%s\n", override->bool_value ? "true" : "false");
|
1904
|
+
} break;
|
1905
|
+
case LLAMA_KV_OVERRIDE_INT: {
|
1906
|
+
printf("%" PRId64 "\n", override->int_value);
|
1907
|
+
} break;
|
1908
|
+
case LLAMA_KV_OVERRIDE_FLOAT: {
|
1909
|
+
printf("%.6f\n", override->float_value);
|
1910
|
+
} break;
|
1911
|
+
default:
|
1912
|
+
// Shouldn't be possible to end up here, but just in case...
|
1913
|
+
throw std::runtime_error(
|
1914
|
+
format("Unsupported attempt to override %s type for metadata key %s\n",
|
1915
|
+
override_type_to_str(override->tag), override->key));
|
1916
|
+
}
|
1917
|
+
return true;
|
1918
|
+
}
|
1919
|
+
LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
|
1920
|
+
__func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
|
1921
|
+
return false;
|
1922
|
+
}
|
1923
|
+
|
1924
|
+
template<typename OT>
|
1925
|
+
static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
|
1926
|
+
try_override(OT & target, const struct llama_model_kv_override *override) {
|
1927
|
+
if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
|
1928
|
+
target = override->bool_value;
|
1929
|
+
return true;
|
1930
|
+
}
|
1931
|
+
return true;
|
1932
|
+
}
|
1933
|
+
|
1934
|
+
template<typename OT>
|
1935
|
+
static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
|
1936
|
+
try_override(OT & target, const struct llama_model_kv_override *override) {
|
1937
|
+
if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
|
1938
|
+
target = override->int_value;
|
1939
|
+
return true;
|
1940
|
+
}
|
1941
|
+
return false;
|
1942
|
+
}
|
1943
|
+
|
1944
|
+
template<typename OT>
|
1945
|
+
static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
|
1946
|
+
try_override(T & target, const struct llama_model_kv_override *override) {
|
1947
|
+
if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
|
1948
|
+
target = override->float_value;
|
1949
|
+
return true;
|
1950
|
+
}
|
1951
|
+
return false;
|
1952
|
+
}
|
1953
|
+
|
1954
|
+
template<typename OT>
|
1955
|
+
static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
|
1956
|
+
try_override(T & target, const struct llama_model_kv_override *override) {
|
1957
|
+
(void)target;
|
1958
|
+
(void)override;
|
1959
|
+
if (!override) { return false; }
|
1960
|
+
// Currently, we should never end up here so it would be a bug if we do.
|
1961
|
+
throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
|
1962
|
+
override ? override->key : "NULL"));
|
1963
|
+
}
|
1964
|
+
|
1965
|
+
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
|
1966
|
+
if (try_override<T>(target, override)) {
|
1967
|
+
return true;
|
1968
|
+
}
|
1969
|
+
if (k < 0) { return false; }
|
1970
|
+
target = get_kv(ctx, k);
|
1971
|
+
return true;
|
1972
|
+
}
|
1973
|
+
|
1974
|
+
static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
|
1975
|
+
return set(ctx, gguf_find_key(ctx, key), target, override);
|
1976
|
+
}
|
1977
|
+
|
1978
|
+
static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
|
1979
|
+
return set(ctx, key.c_str(), target, override);
|
1980
|
+
}
|
1981
|
+
};
|
1982
|
+
}
|
1983
|
+
|
1774
1984
|
struct llama_model_loader {
|
1775
1985
|
int n_kv = 0;
|
1776
1986
|
int n_tensors = 0;
|
@@ -1786,21 +1996,34 @@ struct llama_model_loader {
     llama_fver fver;
 
     std::unique_ptr<llama_mmap> mapping;
+    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
     struct gguf_context * ctx_gguf = NULL;
     struct ggml_context * ctx_meta = NULL;
 
-
+    std::string arch_name;
+    LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_meta,
         };
 
+        if (param_overrides_p != nullptr) {
+            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+                kv_overrides.insert({std::string(p->key), *p});
+            }
+        }
+
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
         if (!ctx_gguf) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
 
+        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
 
@@ -1868,6 +2091,7 @@ struct llama_model_loader {
         }
     }
 
+    LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
     for (int i = 0; i < n_kv; i++) {
         const char * name           = gguf_get_key(ctx_gguf, i);
         const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
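The loader now threads llama_model_kv_override entries (terminated by an entry whose key is empty) from llama_model_params::kv_overrides into get_key, so callers can force individual metadata values at load time. A rough fragment of how a caller might populate such an array — the tag and value member names follow this diff, while the fixed-size key buffer is an assumption about llama.h of this release, so treat it as illustrative:

    // illustrative only: override the expert-used count when loading a model
    std::vector<llama_model_kv_override> overrides(2);
    std::snprintf(overrides[0].key, sizeof(overrides[0].key), "llama.expert_used_count");
    overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
    overrides[0].int_value = 2;
    overrides[1].key[0]    = 0; // empty key terminates the array

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides.data();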
@@ -1913,19 +2137,59 @@ struct llama_model_loader {
|
|
1913
2137
|
}
|
1914
2138
|
}
|
1915
2139
|
|
1916
|
-
|
1917
|
-
|
2140
|
+
template<typename T>
|
2141
|
+
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
2142
|
+
get_arr_n(const std::string & key, T & result, const bool required = true) {
|
2143
|
+
const int kid = gguf_find_key(ctx_gguf, key.c_str());
|
1918
2144
|
|
1919
|
-
|
1920
|
-
|
2145
|
+
if (kid < 0) {
|
2146
|
+
if (required) {
|
2147
|
+
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
2148
|
+
}
|
2149
|
+
return false;
|
2150
|
+
}
|
2151
|
+
|
2152
|
+
struct GGUFMeta::ArrayInfo arr_info =
|
2153
|
+
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
|
2154
|
+
|
2155
|
+
|
2156
|
+
result = arr_info.length;
|
2157
|
+
return true;
|
2158
|
+
}
|
2159
|
+
|
2160
|
+
template<typename T>
|
2161
|
+
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
2162
|
+
get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
|
2163
|
+
return get_arr_n(llm_kv(kid), result, required);
|
2164
|
+
}
|
2165
|
+
|
2166
|
+
template<typename T>
|
2167
|
+
bool get_key(const std::string & key, T & result, const bool required = true) {
|
2168
|
+
auto it = kv_overrides.find(key);
|
2169
|
+
|
2170
|
+
const struct llama_model_kv_override * override =
|
2171
|
+
it != kv_overrides.end() ? &it->second : nullptr;
|
2172
|
+
|
2173
|
+
const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
|
1921
2174
|
|
2175
|
+
if (required && !found) {
|
2176
|
+
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
2177
|
+
}
|
2178
|
+
|
2179
|
+
return found;
|
2180
|
+
}
|
2181
|
+
|
2182
|
+
template<typename T>
|
2183
|
+
bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
|
2184
|
+
return get_key(llm_kv(kid), result, required);
|
2185
|
+
}
|
2186
|
+
|
2187
|
+
std::string get_arch_name() const {
|
1922
2188
|
return arch_name;
|
1923
2189
|
}
|
1924
2190
|
|
1925
2191
|
enum llm_arch get_arch() const {
|
1926
|
-
|
1927
|
-
|
1928
|
-
return llm_arch_from_string(arch_name);
|
2192
|
+
return llm_kv.arch;
|
1929
2193
|
}
|
1930
2194
|
|
1931
2195
|
const char * get_tensor_name(int i) const {
|
@@ -1965,10 +2229,13 @@ struct llama_model_loader {
|
|
1965
2229
|
return tensor;
|
1966
2230
|
}
|
1967
2231
|
|
1968
|
-
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
|
2232
|
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
|
1969
2233
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
1970
2234
|
|
1971
2235
|
if (cur == NULL) {
|
2236
|
+
if (!required) {
|
2237
|
+
return NULL;
|
2238
|
+
}
|
1972
2239
|
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
|
1973
2240
|
}
|
1974
2241
|
|
@@ -2172,11 +2439,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
|
2172
2439
|
static void llm_load_hparams(
|
2173
2440
|
llama_model_loader & ml,
|
2174
2441
|
llama_model & model) {
|
2175
|
-
struct gguf_context * ctx = ml.ctx_gguf;
|
2176
|
-
|
2177
|
-
const auto kv = LLM_KV(model.arch);
|
2178
|
-
|
2179
2442
|
auto & hparams = model.hparams;
|
2443
|
+
const gguf_context * ctx = ml.ctx_gguf;
|
2180
2444
|
|
2181
2445
|
// get metadata as string
|
2182
2446
|
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
@@ -2190,42 +2454,51 @@ static void llm_load_hparams(
|
|
2190
2454
|
}
|
2191
2455
|
|
2192
2456
|
// get general kv
|
2193
|
-
|
2457
|
+
ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
|
2194
2458
|
|
2195
2459
|
// get hparams kv
|
2196
|
-
|
2197
|
-
|
2198
|
-
|
2199
|
-
|
2200
|
-
|
2201
|
-
|
2460
|
+
ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
|
2461
|
+
ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
2462
|
+
ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
2463
|
+
ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
|
2464
|
+
ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
|
2465
|
+
ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
2466
|
+
ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
2467
|
+
ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
2468
|
+
|
2469
|
+
GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
|
2470
|
+
GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
|
2471
|
+
if (hparams.n_expert > 0) {
|
2472
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
2473
|
+
} else {
|
2474
|
+
GGML_ASSERT(hparams.n_expert_used == 0);
|
2475
|
+
}
|
2202
2476
|
|
2203
2477
|
// n_head_kv is optional, default to n_head
|
2204
2478
|
hparams.n_head_kv = hparams.n_head;
|
2205
|
-
|
2479
|
+
ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
|
2206
2480
|
|
2207
|
-
|
2208
|
-
|
2209
|
-
|
2481
|
+
bool rope_finetuned = false;
|
2482
|
+
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
2483
|
+
hparams.rope_finetuned = rope_finetuned;
|
2210
2484
|
|
2211
2485
|
hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
|
2212
|
-
|
2213
|
-
kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
|
2486
|
+
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
|
2214
2487
|
|
2215
2488
|
// rope_freq_base (optional)
|
2216
2489
|
hparams.rope_freq_base_train = 10000.0f;
|
2217
|
-
|
2490
|
+
ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
|
2218
2491
|
|
2219
2492
|
std::string rope_scaling("linear");
|
2220
|
-
|
2493
|
+
ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
|
2221
2494
|
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
2222
2495
|
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
|
2223
2496
|
|
2224
2497
|
// rope_freq_scale (inverse of the kv) is optional
|
2225
2498
|
float ropescale = 0.0f;
|
2226
|
-
|
2227
|
-
|
2228
|
-
|
2499
|
+
if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
|
2500
|
+
// try the old key name
|
2501
|
+
ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
|
2229
2502
|
}
|
2230
2503
|
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
|
2231
2504
|
|
@@ -2233,7 +2506,7 @@ static void llm_load_hparams(
|
|
2233
2506
|
{
|
2234
2507
|
hparams.n_rot = hparams.n_embd / hparams.n_head;
|
2235
2508
|
|
2236
|
-
|
2509
|
+
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
2237
2510
|
|
2238
2511
|
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
|
2239
2512
|
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
|
@@ -2248,7 +2521,7 @@ static void llm_load_hparams(
|
|
2248
2521
|
switch (model.arch) {
|
2249
2522
|
case LLM_ARCH_LLAMA:
|
2250
2523
|
{
|
2251
|
-
|
2524
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
2252
2525
|
|
2253
2526
|
switch (hparams.n_layer) {
|
2254
2527
|
case 26: model.type = e_model::MODEL_3B; break;
|
@@ -2262,7 +2535,7 @@ static void llm_load_hparams(
|
|
2262
2535
|
} break;
|
2263
2536
|
case LLM_ARCH_FALCON:
|
2264
2537
|
{
|
2265
|
-
|
2538
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2266
2539
|
|
2267
2540
|
switch (hparams.n_layer) {
|
2268
2541
|
case 32: model.type = e_model::MODEL_7B; break;
|
@@ -2272,7 +2545,7 @@ static void llm_load_hparams(
|
|
2272
2545
|
} break;
|
2273
2546
|
case LLM_ARCH_BAICHUAN:
|
2274
2547
|
{
|
2275
|
-
|
2548
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
2276
2549
|
switch (hparams.n_layer) {
|
2277
2550
|
case 32: model.type = e_model::MODEL_7B; break;
|
2278
2551
|
case 40: model.type = e_model::MODEL_13B; break;
|
@@ -2281,7 +2554,7 @@ static void llm_load_hparams(
|
|
2281
2554
|
} break;
|
2282
2555
|
case LLM_ARCH_STARCODER:
|
2283
2556
|
{
|
2284
|
-
|
2557
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2285
2558
|
switch (hparams.n_layer) {
|
2286
2559
|
case 24: model.type = e_model::MODEL_1B; break;
|
2287
2560
|
case 36: model.type = e_model::MODEL_3B; break;
|
@@ -2292,7 +2565,7 @@ static void llm_load_hparams(
|
|
2292
2565
|
} break;
|
2293
2566
|
case LLM_ARCH_PERSIMMON:
|
2294
2567
|
{
|
2295
|
-
|
2568
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2296
2569
|
switch (hparams.n_layer) {
|
2297
2570
|
case 36: model.type = e_model::MODEL_8B; break;
|
2298
2571
|
default: model.type = e_model::MODEL_UNKNOWN;
|
@@ -2300,7 +2573,7 @@ static void llm_load_hparams(
|
|
2300
2573
|
} break;
|
2301
2574
|
case LLM_ARCH_REFACT:
|
2302
2575
|
{
|
2303
|
-
|
2576
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
2304
2577
|
switch (hparams.n_layer) {
|
2305
2578
|
case 32: model.type = e_model::MODEL_1B; break;
|
2306
2579
|
default: model.type = e_model::MODEL_UNKNOWN;
|
@@ -2308,7 +2581,7 @@ static void llm_load_hparams(
|
|
2308
2581
|
} break;
|
2309
2582
|
case LLM_ARCH_BLOOM:
|
2310
2583
|
{
|
2311
|
-
|
2584
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2312
2585
|
|
2313
2586
|
switch (hparams.n_layer) {
|
2314
2587
|
case 24: model.type = e_model::MODEL_1B; break;
|
@@ -2323,9 +2596,9 @@ static void llm_load_hparams(
|
|
2323
2596
|
{
|
2324
2597
|
hparams.f_clamp_kqv = 0.0f;
|
2325
2598
|
|
2326
|
-
|
2327
|
-
|
2328
|
-
|
2599
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2600
|
+
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
2601
|
+
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
2329
2602
|
|
2330
2603
|
switch (hparams.n_layer) {
|
2331
2604
|
case 32: model.type = e_model::MODEL_7B; break;
|
@@ -2335,13 +2608,23 @@ static void llm_load_hparams(
|
|
2335
2608
|
} break;
|
2336
2609
|
case LLM_ARCH_STABLELM:
|
2337
2610
|
{
|
2338
|
-
|
2611
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2339
2612
|
|
2340
2613
|
switch (hparams.n_layer) {
|
2341
2614
|
case 32: model.type = e_model::MODEL_3B; break;
|
2342
2615
|
default: model.type = e_model::MODEL_UNKNOWN;
|
2343
2616
|
}
|
2344
2617
|
} break;
|
2618
|
+
case LLM_ARCH_QWEN:
|
2619
|
+
{
|
2620
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
2621
|
+
|
2622
|
+
switch (hparams.n_layer) {
|
2623
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
2624
|
+
case 40: model.type = e_model::MODEL_13B; break;
|
2625
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2626
|
+
}
|
2627
|
+
} break;
|
2345
2628
|
|
2346
2629
|
default: (void)0;
|
2347
2630
|
}
|
@@ -2383,7 +2666,7 @@ static void llm_load_vocab(
|
|
2383
2666
|
{
|
2384
2667
|
std::string tokenizer_name;
|
2385
2668
|
|
2386
|
-
|
2669
|
+
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
|
2387
2670
|
|
2388
2671
|
if (tokenizer_name == "llama") {
|
2389
2672
|
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
@@ -2473,34 +2756,31 @@ static void llm_load_vocab(
|
|
2473
2756
|
};
|
2474
2757
|
for (const auto & it : special_token_types) {
|
2475
2758
|
const std::string & key = kv(std::get<0>(it));
|
2476
|
-
int32_t & id = std::get<1>(it)
|
2759
|
+
int32_t & id = std::get<1>(it);
|
2477
2760
|
|
2478
|
-
|
2479
|
-
|
2480
|
-
|
2481
|
-
|
2482
|
-
if (
|
2483
|
-
LLAMA_LOG_WARN("%s: bad special token: '%s' = %
|
2484
|
-
__func__, key.c_str(),
|
2485
|
-
|
2761
|
+
uint32_t new_id;
|
2762
|
+
if (!ml.get_key(std::get<0>(it), new_id, false)) {
|
2763
|
+
continue;
|
2764
|
+
}
|
2765
|
+
if (new_id >= vocab.id_to_token.size()) {
|
2766
|
+
LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
|
2767
|
+
__func__, key.c_str(), new_id, id);
|
2768
|
+
} else {
|
2769
|
+
id = new_id;
|
2486
2770
|
}
|
2487
2771
|
|
2488
2772
|
}
|
2489
2773
|
|
2490
2774
|
// Handle add_bos_token and add_eos_token
|
2491
|
-
|
2492
|
-
|
2493
|
-
|
2494
|
-
|
2495
|
-
|
2496
|
-
|
2497
|
-
|
2498
|
-
|
2499
|
-
|
2500
|
-
ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
|
2501
|
-
vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
|
2502
|
-
if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
|
2503
|
-
LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
|
2775
|
+
{
|
2776
|
+
bool temp = true;
|
2777
|
+
|
2778
|
+
if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
|
2779
|
+
vocab.special_add_bos = int(temp);
|
2780
|
+
}
|
2781
|
+
if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
|
2782
|
+
vocab.special_add_eos = int(temp);
|
2783
|
+
}
|
2504
2784
|
}
|
2505
2785
|
}
|
2506
2786
|
|
@@ -2511,7 +2791,7 @@ static void llm_load_vocab(
|
|
2511
2791
|
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
|
2512
2792
|
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
|
2513
2793
|
// are special tokens.
|
2514
|
-
// From testing, this appears to
|
2794
|
+
// From testing, this appears to correlate 1:1 with special tokens.
|
2515
2795
|
//
|
2516
2796
|
|
2517
2797
|
// Counting special tokens and verifying in only one direction
|
@@ -2624,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
2624
2904
|
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
2625
2905
|
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
2626
2906
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
2907
|
+
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
2908
|
+
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
2627
2909
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
2628
2910
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
2629
2911
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
@@ -2733,14 +3015,7 @@ static void llm_load_tensors(
|
|
2733
3015
|
ggml_backend_type backend_output;
|
2734
3016
|
|
2735
3017
|
if (n_gpu_layers > int(n_layer)) {
|
2736
|
-
|
2737
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
2738
|
-
#ifndef _WIN32
|
2739
|
-
backend_norm = llama_backend_offload;
|
2740
|
-
#else
|
2741
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
2742
|
-
#endif // _WIN32
|
2743
|
-
|
3018
|
+
backend_norm = llama_backend_offload;
|
2744
3019
|
backend_output = llama_backend_offload_split;
|
2745
3020
|
} else {
|
2746
3021
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -2777,17 +3052,55 @@ static void llm_load_tensors(
|
|
2777
3052
|
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
2778
3053
|
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
2779
3054
|
|
3055
|
+
// optional bias tensors
|
3056
|
+
layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
|
3057
|
+
layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
|
3058
|
+
layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
|
3059
|
+
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
|
3060
|
+
|
2780
3061
|
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
2781
3062
|
|
2782
|
-
layer.
|
2783
|
-
|
2784
|
-
layer.
|
3063
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
|
3064
|
+
|
3065
|
+
if (layer.ffn_gate_inp == nullptr) {
|
3066
|
+
GGML_ASSERT(hparams.n_expert == 0);
|
3067
|
+
GGML_ASSERT(hparams.n_expert_used == 0);
|
3068
|
+
|
3069
|
+
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
3070
|
+
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3071
|
+
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3072
|
+
} else {
|
3073
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
3074
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
3075
|
+
|
3076
|
+
// MoE branch
|
3077
|
+
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
3078
|
+
layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
|
3079
|
+
layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
|
3080
|
+
layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
|
3081
|
+
}
|
3082
|
+
}
|
2785
3083
|
|
2786
3084
|
if (backend == GGML_BACKEND_GPU) {
|
2787
3085
|
vram_weights +=
|
2788
|
-
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq)
|
2789
|
-
ggml_nbytes(layer.wv)
|
2790
|
-
|
3086
|
+
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
3087
|
+
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
|
3088
|
+
(layer.bq ? ggml_nbytes(layer.bq) : 0) +
|
3089
|
+
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
|
3090
|
+
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
|
3091
|
+
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
|
3092
|
+
ggml_nbytes(layer.ffn_norm);
|
3093
|
+
|
3094
|
+
if (layer.ffn_gate_inp == nullptr) {
|
3095
|
+
vram_weights +=
|
3096
|
+
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
3097
|
+
} else {
|
3098
|
+
vram_weights += ggml_nbytes(layer.ffn_gate_inp);
|
3099
|
+
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
3100
|
+
vram_weights +=
|
3101
|
+
ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
|
3102
|
+
}
|
3103
|
+
}
|
2791
3104
|
}
|
2792
3105
|
}
|
2793
3106
|
} break;
|
@@ -2799,14 +3112,7 @@ static void llm_load_tensors(
|
|
2799
3112
|
ggml_backend_type backend_output;
|
2800
3113
|
|
2801
3114
|
if (n_gpu_layers > int(n_layer)) {
|
2802
|
-
|
2803
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
2804
|
-
#ifndef _WIN32
|
2805
|
-
backend_norm = llama_backend_offload;
|
2806
|
-
#else
|
2807
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
2808
|
-
#endif // _WIN32
|
2809
|
-
|
3115
|
+
backend_norm = llama_backend_offload;
|
2810
3116
|
backend_output = llama_backend_offload_split;
|
2811
3117
|
} else {
|
2812
3118
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -2869,14 +3175,7 @@ static void llm_load_tensors(
|
|
2869
3175
|
ggml_backend_type backend_output;
|
2870
3176
|
|
2871
3177
|
if (n_gpu_layers > int(n_layer)) {
|
2872
|
-
|
2873
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
2874
|
-
#ifndef _WIN32
|
2875
|
-
backend_norm = llama_backend_offload;
|
2876
|
-
#else
|
2877
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
2878
|
-
#endif // _WIN32
|
2879
|
-
|
3178
|
+
backend_norm = llama_backend_offload;
|
2880
3179
|
backend_output = llama_backend_offload_split;
|
2881
3180
|
} else {
|
2882
3181
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -2946,14 +3245,7 @@ static void llm_load_tensors(
|
|
2946
3245
|
ggml_backend_type backend_output;
|
2947
3246
|
|
2948
3247
|
if (n_gpu_layers > int(n_layer)) {
|
2949
|
-
|
2950
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
2951
|
-
#ifndef _WIN32
|
2952
|
-
backend_norm = llama_backend_offload;
|
2953
|
-
#else
|
2954
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
2955
|
-
#endif // _WIN32
|
2956
|
-
|
3248
|
+
backend_norm = llama_backend_offload;
|
2957
3249
|
backend_output = llama_backend_offload_split;
|
2958
3250
|
} else {
|
2959
3251
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -3023,21 +3315,7 @@ static void llm_load_tensors(
|
|
3023
3315
|
ggml_backend_type backend_output;
|
3024
3316
|
|
3025
3317
|
if (n_gpu_layers > int(n_layer)) {
|
3026
|
-
|
3027
|
-
if (n_gpu_layers > int(n_layer + 1)) {
|
3028
|
-
LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
|
3029
|
-
__func__, n_layer + 1);
|
3030
|
-
throw std::runtime_error("Persimmon CUDA offload failed");
|
3031
|
-
}
|
3032
|
-
#endif
|
3033
|
-
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
3034
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
3035
|
-
#ifndef _WIN32
|
3036
|
-
backend_norm = llama_backend_offload;
|
3037
|
-
#else
|
3038
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
3039
|
-
#endif // _WIN32
|
3040
|
-
|
3318
|
+
backend_norm = llama_backend_offload;
|
3041
3319
|
backend_output = llama_backend_offload_split;
|
3042
3320
|
} else {
|
3043
3321
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -3096,14 +3374,7 @@ static void llm_load_tensors(
|
|
3096
3374
|
ggml_backend_type backend_output;
|
3097
3375
|
|
3098
3376
|
if (n_gpu_layers > int(n_layer)) {
|
3099
|
-
|
3100
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
3101
|
-
#ifndef _WIN32
|
3102
|
-
backend_norm = llama_backend_offload;
|
3103
|
-
#else
|
3104
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
3105
|
-
#endif // _WIN32
|
3106
|
-
|
3377
|
+
backend_norm = llama_backend_offload;
|
3107
3378
|
backend_output = llama_backend_offload_split;
|
3108
3379
|
} else {
|
3109
3380
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -3174,14 +3445,7 @@ static void llm_load_tensors(
|
|
3174
3445
|
ggml_backend_type backend_output;
|
3175
3446
|
|
3176
3447
|
if (n_gpu_layers > int(n_layer)) {
|
3177
|
-
|
3178
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
3179
|
-
#ifndef _WIN32
|
3180
|
-
backend_norm = llama_backend_offload;
|
3181
|
-
#else
|
3182
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
3183
|
-
#endif // _WIN32
|
3184
|
-
|
3448
|
+
backend_norm = llama_backend_offload;
|
3185
3449
|
backend_output = llama_backend_offload_split;
|
3186
3450
|
} else {
|
3187
3451
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -3241,14 +3505,7 @@ static void llm_load_tensors(
|
|
3241
3505
|
ggml_backend_type backend_output;
|
3242
3506
|
|
3243
3507
|
if (n_gpu_layers > int(n_layer)) {
|
3244
|
-
|
3245
|
-
// on Windows however this is detrimental unless everything is on the GPU
|
3246
|
-
#ifndef _WIN32
|
3247
|
-
backend_norm = llama_backend_offload;
|
3248
|
-
#else
|
3249
|
-
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
3250
|
-
#endif // _WIN32
|
3251
|
-
|
3508
|
+
backend_norm = llama_backend_offload;
|
3252
3509
|
backend_output = llama_backend_offload_split;
|
3253
3510
|
} else {
|
3254
3511
|
backend_norm = GGML_BACKEND_CPU;
|
@@ -3305,6 +3562,64 @@ static void llm_load_tensors(
|
|
3305
3562
|
}
|
3306
3563
|
}
|
3307
3564
|
} break;
|
3565
|
+
case LLM_ARCH_QWEN:
|
3566
|
+
{
|
3567
|
+
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
3568
|
+
{
|
3569
|
+
ggml_backend_type backend_norm;
|
3570
|
+
ggml_backend_type backend_output;
|
3571
|
+
|
3572
|
+
if (n_gpu_layers > int(n_layer)) {
|
3573
|
+
backend_norm = llama_backend_offload;
|
3574
|
+
backend_output = llama_backend_offload_split;
|
3575
|
+
} else {
|
3576
|
+
backend_norm = GGML_BACKEND_CPU;
|
3577
|
+
backend_output = GGML_BACKEND_CPU;
|
3578
|
+
}
|
3579
|
+
|
3580
|
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3581
|
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3582
|
+
|
3583
|
+
if (backend_norm == GGML_BACKEND_GPU) {
|
3584
|
+
vram_weights += ggml_nbytes(model.output_norm);
|
3585
|
+
}
|
3586
|
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3587
|
+
vram_weights += ggml_nbytes(model.output);
|
3588
|
+
}
|
3589
|
+
}
|
3590
|
+
|
3591
|
+
const uint32_t n_ff = hparams.n_ff / 2;
|
3592
|
+
|
3593
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
3594
|
+
|
3595
|
+
model.layers.resize(n_layer);
|
3596
|
+
|
3597
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
3598
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3599
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3600
|
+
|
3601
|
+
auto & layer = model.layers[i];
|
3602
|
+
|
3603
|
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
3604
|
+
|
3605
|
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
|
3606
|
+
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
|
3607
|
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
3608
|
+
|
3609
|
+
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
3610
|
+
|
3611
|
+
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
3612
|
+
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3613
|
+
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3614
|
+
|
3615
|
+
if (backend == GGML_BACKEND_GPU) {
|
3616
|
+
vram_weights +=
|
3617
|
+
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
3618
|
+
ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
|
3619
|
+
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
3620
|
+
}
|
3621
|
+
}
|
3622
|
+
} break;
|
3308
3623
|
|
3309
3624
|
default:
|
3310
3625
|
throw std::runtime_error("unknown architecture");
|
@@ -3331,8 +3646,8 @@ static void llm_load_tensors(
|
|
3331
3646
|
}
|
3332
3647
|
|
3333
3648
|
#ifdef GGML_USE_CUBLAS
|
3334
|
-
const int max_backend_supported_layers = hparams.n_layer +
|
3335
|
-
const int max_offloadable_layers = hparams.n_layer +
|
3649
|
+
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3650
|
+
const int max_offloadable_layers = hparams.n_layer + 1;
|
3336
3651
|
#elif GGML_USE_CLBLAST
|
3337
3652
|
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3338
3653
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
@@ -3373,7 +3688,7 @@ static void llm_load_tensors(
|
|
3373
3688
|
|
3374
3689
|
static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
|
3375
3690
|
try {
|
3376
|
-
llama_model_loader ml(fname, params.use_mmap);
|
3691
|
+
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
3377
3692
|
|
3378
3693
|
model.hparams.vocab_only = params.vocab_only;
|
3379
3694
|
|
@@ -3500,11 +3815,11 @@ static void llm_build_k_shift(
         struct ggml_tensor * tmp =
             // we rotate only the first n_rot dimensions
             ggml_rope_custom_inplace(ctx,
-                    ggml_view_3d(ctx, kv.
+                    ggml_view_3d(ctx, kv.k_l[il],
                         n_embd_head, n_head_kv, n_ctx,
-
-
-
+                        ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                        ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                        0),
                     K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
         cb(tmp, "K_shifted", il);
@@ -3531,13 +3846,13 @@ static void llm_build_kv_store(
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
     cb(v_cur_t, "v_cur_t", il);
 
-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.
-            (
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
+            (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.
-            (
-            (
+    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+            (  n_ctx)*ggml_element_size(kv.v_l[il]),
+            (kv_head)*ggml_element_size(kv.v_l[il]));
     cb(v_cache_view, "v_cache_view", il);
 
     // important: storing RoPE-ed version of K in the KV cache!
@@ -3689,11 +4004,11 @@ static struct ggml_tensor * llm_build_kqv(
     cb(q, "q", il);
 
     struct ggml_tensor * k =
-        ggml_view_3d(ctx, kv.
+        ggml_view_3d(ctx, kv.k_l[il],
                 n_embd_head, n_kv, n_head_kv,
-
-
-
+                ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                0);
     cb(k, "k", il);
 
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -3724,11 +4039,11 @@ static struct ggml_tensor * llm_build_kqv(
 
     // split cached v into n_head heads
     struct ggml_tensor * v =
-        ggml_view_3d(ctx, kv.
+        ggml_view_3d(ctx, kv.v_l[il],
                 n_kv, n_embd_head, n_head_kv,
-                ggml_element_size(kv.
-                ggml_element_size(kv.
-
+                ggml_element_size(kv.v_l[il])*n_ctx,
+                ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+                0);
     cb(v, "v", il);
 
     struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
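Note: the K-cache views above now compute byte offsets with ggml_row_size on the cache tensor's own type instead of ggml_element_size. That is what allows a quantized ktype to be passed to llama_kv_cache_init: quantized rows are stored in fixed-size blocks and cannot be addressed per element. A rough illustration — the q8_0 block layout stated here is an assumption about this vintage of ggml:

    // bytes occupied by one cached K row of 4096 elements
    const size_t f16_row  = ggml_row_size(GGML_TYPE_F16,  4096); // 4096 * 2 bytes        = 8192
    const size_t q8_0_row = ggml_row_size(GGML_TYPE_Q8_0, 4096); // 4096 / 32 * ~34 bytes = ~4352 (assumed block layout)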
@@ -3766,6 +4081,8 @@ struct llm_build_context {
|
|
3766
4081
|
const int64_t n_head_kv;
|
3767
4082
|
const int64_t n_embd_head;
|
3768
4083
|
const int64_t n_embd_gqa;
|
4084
|
+
const int64_t n_expert;
|
4085
|
+
const int64_t n_expert_used;
|
3769
4086
|
|
3770
4087
|
const float freq_base;
|
3771
4088
|
const float freq_scale;
|
@@ -3807,6 +4124,8 @@ struct llm_build_context {
|
|
3807
4124
|
n_head_kv (hparams.n_head_kv),
|
3808
4125
|
n_embd_head (hparams.n_embd_head()),
|
3809
4126
|
n_embd_gqa (hparams.n_embd_gqa()),
|
4127
|
+
n_expert (hparams.n_expert),
|
4128
|
+
n_expert_used (hparams.n_expert_used),
|
3810
4129
|
freq_base (cparams.rope_freq_base),
|
3811
4130
|
freq_scale (cparams.rope_freq_scale),
|
3812
4131
|
ext_factor (cparams.yarn_ext_factor),
|
@@ -3886,12 +4205,24 @@ struct llm_build_context {
|
|
3886
4205
|
// compute Q and K and RoPE them
|
3887
4206
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
3888
4207
|
cb(Qcur, "Qcur", il);
|
4208
|
+
if (model.layers[il].bq) {
|
4209
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
4210
|
+
cb(Qcur, "Qcur", il);
|
4211
|
+
}
|
3889
4212
|
|
3890
4213
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
3891
4214
|
cb(Kcur, "Kcur", il);
|
4215
|
+
if (model.layers[il].bk) {
|
4216
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
4217
|
+
cb(Kcur, "Kcur", il);
|
4218
|
+
}
|
3892
4219
|
|
3893
4220
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
3894
4221
|
cb(Vcur, "Vcur", il);
|
4222
|
+
if (model.layers[il].bv) {
|
4223
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
4224
|
+
cb(Vcur, "Vcur", il);
|
4225
|
+
}
|
3895
4226
|
|
3896
4227
|
Qcur = ggml_rope_custom(
|
3897
4228
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
@@ -3910,7 +4241,7 @@ struct llm_build_context {
|
|
3910
4241
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
3911
4242
|
|
3912
4243
|
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
3913
|
-
model.layers[il].wo,
|
4244
|
+
model.layers[il].wo, model.layers[il].bo,
|
3914
4245
|
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
3915
4246
|
cb(cur, "kqv_out", il);
|
3916
4247
|
}
|
@@ -3919,7 +4250,7 @@ struct llm_build_context {
|
|
3919
4250
|
cb(ffn_inp, "ffn_inp", il);
|
3920
4251
|
|
3921
4252
|
// feed-forward network
|
3922
|
-
{
|
4253
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
3923
4254
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
3924
4255
|
model.layers[il].ffn_norm, NULL,
|
3925
4256
|
LLM_NORM_RMS, cb, il);
|
@@ -3931,6 +4262,69 @@ struct llm_build_context {
                         model.layers[il].ffn_down, NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+                cb(logits, "ffn_moe_logits", il);
+
+                ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+                cb(probs, "ffn_moe_probs", il);
+
+                // select experts
+                ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+                cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+                ggml_tensor * weights = ggml_get_rows(ctx0,
+                        ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+                cb(weights, "ffn_moe_weights", il);
+
+                weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+                ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+                cb(weights_sum, "ffn_moe_weights_sum", il);
+
+                weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+                cb(weights, "ffn_moe_weights_norm", il);
+
+                // compute expert outputs
+                ggml_tensor * moe_out = nullptr;
+
+                for (int i = 0; i < n_expert_used; ++i) {
+                    ggml_tensor * cur_expert;
+
+                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+                    cb(cur_up, "ffn_moe_up", il);
+
+                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+                    cb(cur_gate, "ffn_moe_gate", il);
+
+                    cur_gate = ggml_silu(ctx0, cur_gate);
+                    cb(cur_gate, "ffn_moe_silu", il);
+
+                    cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
+                    cb(cur_expert, "ffn_moe_gate_par", il);
+
+                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+                    cb(cur_expert, "ffn_moe_down", il);
+
+                    cur_expert = ggml_mul(ctx0, cur_expert,
+                            ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+                    cb(cur_expert, "ffn_moe_weighted", il);
+
+                    if (i == 0) {
+                        moe_out = cur_expert;
+                    } else {
+                        moe_out = ggml_add(ctx0, moe_out, cur_expert);
+                        cb(moe_out, "ffn_moe_out", il);
+                    }
+                }
+
+                cur = moe_out;
             }

             cur = ggml_add(ctx0, cur, ffn_inp);
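For readers following the MoE branch above, here is a minimal standalone sketch of the same per-token routing math on plain std::vector instead of ggml tensors: softmax over the gating logits, selection of the top n_expert_used experts, and renormalization of the kept weights (the ggml_div step). The function and variable names are illustrative only and are not part of llama.cpp.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

struct routed_experts {
    std::vector<int>   ids;      // indices of the selected experts
    std::vector<float> weights;  // renormalized mixing weights, same order as ids
};

static routed_experts route_token(const std::vector<float> & logits, int n_used) {
    const int n_expert = (int) logits.size();

    // softmax over the gating logits
    std::vector<float> probs(n_expert);
    const float max_l = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) {
        probs[e] = std::exp(logits[e] - max_l);
        sum += probs[e];
    }
    for (float & p : probs) { p /= sum; }

    // top-k expert selection: argsort by probability, keep the first n_used
    std::vector<int> order(n_expert);
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + n_used, order.end(),
        [&](int a, int b) { return probs[a] > probs[b]; });

    routed_experts r;
    float kept = 0.0f;
    for (int i = 0; i < n_used; ++i) {
        r.ids.push_back(order[i]);
        r.weights.push_back(probs[order[i]]);
        kept += probs[order[i]];
    }
    // renormalize the kept weights so they sum to 1
    for (float & w : r.weights) { w /= kept; }
    return r;
}

int main() {
    const routed_experts r = route_token({0.1f, 2.0f, -1.0f, 1.5f, 0.0f, 0.3f, -0.5f, 0.9f}, 2);
    for (size_t i = 0; i < r.ids.size(); ++i) {
        std::printf("expert %d weight %.3f\n", r.ids[i], r.weights[i]);
    }
}
```

Each selected expert's FFN output is then scaled by its weight and summed, which is what the ggml_mul_mat_id / ggml_mul / ggml_add chain in the patch does per expert slot.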
@@ -4308,6 +4702,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "imp_embd", -1);

+        // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);

@@ -4315,6 +4710,7 @@ struct llm_build_context {
         struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
         cb(KQ_scale, "KQ_scale", -1);

+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);

@@ -4903,6 +5299,121 @@ struct llm_build_context {

         return gf;
     }
+
+    struct ggml_cgraph * build_qwen() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 //
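The build_qwen() graph above splits a single fused QKV projection into Q, K and V with ggml_view_2d byte offsets (0, 1 and 2 times sizeof(float)*n_embd within each row). The sketch below shows the same slicing on a plain float buffer; the row-major layout and the helper names are assumptions made only for illustration, not llama.cpp API.

```cpp
#include <cstddef>
#include <vector>

struct qkv_views {
    const float * q;
    const float * k;
    const float * v;
};

// returns pointers to the Q/K/V slices of one token's fused projection row,
// which holds 3*n_embd floats laid out as [Q | K | V]
static qkv_views split_qkv_row(const float * fused_row, size_t n_embd) {
    return {
        fused_row + 0 * n_embd,  // byte offset 0*sizeof(float)*n_embd
        fused_row + 1 * n_embd,  // byte offset 1*sizeof(float)*n_embd
        fused_row + 2 * n_embd,  // byte offset 2*sizeof(float)*n_embd
    };
}

int main() {
    const size_t n_embd = 4, n_tokens = 2;
    std::vector<float> fused(n_tokens * 3 * n_embd, 0.0f);

    for (size_t t = 0; t < n_tokens; ++t) {
        qkv_views qkv = split_qkv_row(fused.data() + t * 3 * n_embd, n_embd);
        (void) qkv; // Q/K/V of token t, each n_embd floats long
    }
}
```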
@@ -4913,8 +5424,8 @@ struct llm_build_context {
 enum llm_offload_func_e {
     OFFLOAD_FUNC_NOP,
     OFFLOAD_FUNC,
-
-
+    OFFLOAD_FUNC_FRC, // force offload
+    OFFLOAD_FUNC_KQV,
     OFFLOAD_FUNC_NR,
     OFFLOAD_FUNC_EMB,
     OFFLOAD_FUNC_OUT,
@@ -5000,11 +5511,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
   //{ "inp_embd",            OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel
     { "pos_embd",            OFFLOAD_FUNC_NR  },

-    { "inp_pos",
-    { "KQ_scale",
-    { "KQ_mask",
-    { "K_shift",
-
+    { "inp_pos",             OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
+    { "KQ_scale",            OFFLOAD_FUNC_FRC },
+    { "KQ_mask",             OFFLOAD_FUNC_FRC },
+    { "K_shift",             OFFLOAD_FUNC_FRC },
+
+    { "K_shifted",           OFFLOAD_FUNC     },

     { "inp_norm",            OFFLOAD_FUNC_NR  },
     { "inp_norm_w",          OFFLOAD_FUNC_NR  },
@@ -5017,38 +5529,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "attn_norm",           OFFLOAD_FUNC     },
     { "attn_norm_2",         OFFLOAD_FUNC     },

-    { "wqkv",
-    { "bqkv",
-    { "wqkv_clamped",
-
-    { "tmpk",
-    { "tmpq",
-    { "tmpv",
-    { "Kcur",
-    { "Qcur",
-    { "Vcur",
-
-    { "krot",
-    { "qrot",
-    { "kpass",
-    { "qpass",
-    { "krotated",
-    { "qrotated",
-
-    { "q",
-    { "k",
-    { "kq",
-    { "kq_scaled",
-    { "kq_scaled_alibi",
-    { "kq_masked",
-    { "kq_soft_max",
-    { "kq_soft_max_ext",
-    { "v",
-    { "kqv",
-    { "kqv_merged",
-    { "kqv_merged_cont",
-    { "kqv_wo",
-    { "kqv_out",
+    { "wqkv",                OFFLOAD_FUNC_KQV },
+    { "bqkv",                OFFLOAD_FUNC_KQV },
+    { "wqkv_clamped",        OFFLOAD_FUNC_KQV },
+
+    { "tmpk",                OFFLOAD_FUNC_KQV },
+    { "tmpq",                OFFLOAD_FUNC_KQV },
+    { "tmpv",                OFFLOAD_FUNC_KQV },
+    { "Kcur",                OFFLOAD_FUNC_KQV },
+    { "Qcur",                OFFLOAD_FUNC_KQV },
+    { "Vcur",                OFFLOAD_FUNC_KQV },
+
+    { "krot",                OFFLOAD_FUNC_KQV },
+    { "qrot",                OFFLOAD_FUNC_KQV },
+    { "kpass",               OFFLOAD_FUNC_KQV },
+    { "qpass",               OFFLOAD_FUNC_KQV },
+    { "krotated",            OFFLOAD_FUNC_KQV },
+    { "qrotated",            OFFLOAD_FUNC_KQV },
+
+    { "q",                   OFFLOAD_FUNC_KQV },
+    { "k",                   OFFLOAD_FUNC_KQV },
+    { "kq",                  OFFLOAD_FUNC_KQV },
+    { "kq_scaled",           OFFLOAD_FUNC_KQV },
+    { "kq_scaled_alibi",     OFFLOAD_FUNC_KQV },
+    { "kq_masked",           OFFLOAD_FUNC_KQV },
+    { "kq_soft_max",         OFFLOAD_FUNC_KQV },
+    { "kq_soft_max_ext",     OFFLOAD_FUNC_KQV },
+    { "v",                   OFFLOAD_FUNC_KQV },
+    { "kqv",                 OFFLOAD_FUNC_KQV },
+    { "kqv_merged",          OFFLOAD_FUNC_KQV },
+    { "kqv_merged_cont",     OFFLOAD_FUNC_KQV },
+    { "kqv_wo",              OFFLOAD_FUNC_KQV },
+    { "kqv_out",             OFFLOAD_FUNC_KQV },

     { "ffn_inp",             OFFLOAD_FUNC     },
     { "ffn_norm",            OFFLOAD_FUNC     },
@@ -5067,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "ffn_relu",            OFFLOAD_FUNC     },
     { "ffn_sqr(relu)",       OFFLOAD_FUNC     },

+    { "ffn_moe_logits",       OFFLOAD_FUNC    },
+    { "ffn_moe_probs",        OFFLOAD_FUNC    },
+    { "ffn_moe_argsort",      OFFLOAD_FUNC    },
+    { "ffn_moe_weights",      OFFLOAD_FUNC    },
+    { "ffn_moe_weights_sum",  OFFLOAD_FUNC    },
+    { "ffn_moe_weights_norm", OFFLOAD_FUNC    },
+    { "ffn_moe_weighted",     OFFLOAD_FUNC    },
+    { "ffn_moe_up",           OFFLOAD_FUNC    },
+    { "ffn_moe_gate",         OFFLOAD_FUNC    },
+    { "ffn_moe_silu",         OFFLOAD_FUNC    },
+    { "ffn_moe_gate_par",     OFFLOAD_FUNC    },
+    { "ffn_moe_down",         OFFLOAD_FUNC    },
+    { "ffn_moe_out",          OFFLOAD_FUNC    },
+
     { "l_out",               OFFLOAD_FUNC     },

     { "result_norm",         OFFLOAD_FUNC_EMB },
@@ -5240,15 +5766,15 @@ static struct ggml_cgraph * llama_build_graph(
         { OFFLOAD_FUNC_NOP, "CPU" },
         { OFFLOAD_FUNC_OUT, "CPU" },
 #ifdef GGML_USE_CUBLAS
-        { OFFLOAD_FUNC,     "GPU (CUDA)"
-        {
-        {
-        { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR"
+        { OFFLOAD_FUNC,     "GPU (CUDA)"     },
+        { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
+        { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
+        { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR"  },
         { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
 #else
         { OFFLOAD_FUNC,     "CPU" },
-        {
-        {
+        { OFFLOAD_FUNC_FRC, "CPU" },
+        { OFFLOAD_FUNC_KQV, "CPU" },
         { OFFLOAD_FUNC_NR,  "CPU" },
         { OFFLOAD_FUNC_EMB, "CPU" },
 #endif // GGML_USE_CUBLAS
@@ -5281,18 +5807,23 @@ static struct ggml_cgraph * llama_build_graph(
                     }
                 }
                 break;
-            case
-                if (
+            case OFFLOAD_FUNC_FRC:
+                if (!lctx.cparams.offload_kqv) {
                     func_e = OFFLOAD_FUNC_NOP;
-                }
-
-
-                if (n_gpu_layers <= n_layer + 1) {
+                } break;
+            case OFFLOAD_FUNC_KQV:
+                if (!lctx.cparams.offload_kqv) {
                     func_e = OFFLOAD_FUNC_NOP;
+                } else {
+                    if (n_gpu_layers < n_layer) {
+                        if (il < i_gpu_start) {
+                            func_e = OFFLOAD_FUNC_NOP;
+                        }
+                    }
                 }
                 break;
-            case
-                if (n_gpu_layers <= n_layer +
+            case OFFLOAD_FUNC_NR:
+                if (n_gpu_layers <= n_layer + 0) {
                     func_e = OFFLOAD_FUNC_NOP;
                 }
                 break;
@@ -5317,8 +5848,8 @@ static struct ggml_cgraph * llama_build_graph(
             case OFFLOAD_FUNC_NOP:
             case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
             case OFFLOAD_FUNC:
-            case
-            case
+            case OFFLOAD_FUNC_KQV:
+            case OFFLOAD_FUNC_FRC:
            case OFFLOAD_FUNC_NR:
             case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
             default: GGML_ASSERT(false);
@@ -5377,6 +5908,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_stablelm();
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                result = llm.build_qwen();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -5454,7 +5989,7 @@ static int llama_decode_internal(
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;

-    // helpers for smoother batch API
+    // helpers for smoother batch API transition
     // after deprecating the llama_eval calls, these will be removed
     std::vector<llama_pos> pos;

@@ -5499,8 +6034,8 @@ static int llama_decode_internal(
         // a heuristic, to avoid attending the full cache if it is not yet utilized
         // after enough generations, the benefit from this heuristic disappears
         // if we start defragmenting the cache, the benefit from this will be more important
-
-        kv_self.n =
+        kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+        //kv_self.n = llama_kv_cache_cell_max(kv_self);

         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

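The new kv_self.n expression above pads the number of used KV cells up to a multiple of 32, keeps a floor of 32, and clamps to n_ctx. A small worked example of that arithmetic, with llama_kv_cache_cell_max() replaced by a plain integer argument purely for illustration:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// mirrors: std::min(n_ctx, std::max(32, GGML_PAD(cell_max, 32)))
static int32_t kv_n_heuristic(int32_t cell_max, int32_t n_ctx) {
    const int32_t padded = ((cell_max + 31) / 32) * 32; // GGML_PAD(cell_max, 32)
    return std::min(n_ctx, std::max((int32_t) 32, padded));
}

int main() {
    std::printf("%d\n", kv_n_heuristic(   1, 4096)); // 32   (floor)
    std::printf("%d\n", kv_n_heuristic( 100, 4096)); // 128  (padded up)
    std::printf("%d\n", kv_n_heuristic(5000, 4096)); // 4096 (clamped to n_ctx)
}
```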
@@ -5551,7 +6086,7 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }

-    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer +
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
     if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
@@ -6233,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<

             // loop over the text
             while (true) {
-                // find the first
+                // find the first occurrence of a given special token in this fragment
                 // passing offset argument only limit the "search area" but match coordinates
                 // are still relative to the source full raw_text
                 auto match = raw_text->find(special_token, raw_text_base_offset);

-                // no
+                // no occurrences found, stop processing this fragment for a given special token
                 if (match == std::string::npos) break;

                 // check if match is within bounds of offset <-> length
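The loop the updated comments describe is a standard "find with offset" scan. A self-contained version of just that scanning pattern, independent of the tokenizer's fragment bookkeeping (the find_all helper is illustrative, not part of llama.cpp):

```cpp
#include <cstdio>
#include <string>
#include <vector>

static std::vector<size_t> find_all(const std::string & raw_text, const std::string & special_token) {
    std::vector<size_t> matches;
    size_t offset = 0;
    while (true) {
        // find the first occurrence at or after the current offset;
        // positions stay relative to the full string
        const size_t match = raw_text.find(special_token, offset);
        if (match == std::string::npos) break; // no occurrences left
        matches.push_back(match);
        offset = match + special_token.size();
    }
    return matches;
}

int main() {
    for (size_t pos : find_all("<s>hello<s>world<s>", "<s>")) {
        std::printf("match at %zu\n", pos); // 0, 8, 16
    }
}
```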
@@ -6410,14 +6945,13 @@ struct llama_grammar_candidate {
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const
-        size_t n_src,
+        const std::string & src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char * pos = src;
+    const char * pos = src.c_str();
     std::vector<uint32_t> code_points;
     // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
-    code_points.reserve(
+    code_points.reserve(src.size() + 1);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;

@@ -6468,13 +7002,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }

-static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        std::string src,
-        llama_partial_utf8 partial_start
-) {
-    return decode_utf8(src.c_str(), src.size(), partial_start);
-}
-
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
@@ -7113,7 +7640,9 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     const llama_token eos = llama_token_eos(&ctx->model);

     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    candidates_decoded.reserve(candidates->size);
     std::vector<llama_grammar_candidate> candidates_grammar;
+    candidates_grammar.reserve(candidates->size);

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
@@ -7443,7 +7972,7 @@ struct llama_beam_search_data {
     }

     // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-    // The
+    // The repetitive patterns below reflect the 2 stages of heaps:
     // * Gather elements until the vector is full, then call std::make_heap() on it.
     // * If the heap is full and a new element is found that should be included, pop the
     //   least element to the back(), replace it with the new, then push it into the heap.
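The two-stage heap pattern the comment spells out can be shown on plain floats; the sketch below is only an illustration of the technique, not the beam-search code itself:

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// keep the k largest values of xs using a min-heap of size k
static std::vector<float> top_k(const std::vector<float> & xs, size_t k) {
    std::vector<float> heap;
    heap.reserve(k);
    for (float x : xs) {
        if (heap.size() < k) {
            // stage 1: gather until full, then heapify (smallest element at front)
            heap.push_back(x);
            if (heap.size() == k) {
                std::make_heap(heap.begin(), heap.end(), std::greater<float>());
            }
        } else if (x > heap.front()) {
            // stage 2: pop the least element to the back, replace it, push the new one
            std::pop_heap(heap.begin(), heap.end(), std::greater<float>());
            heap.back() = x;
            std::push_heap(heap.begin(), heap.end(), std::greater<float>());
        }
    }
    return heap; // the k largest values, in heap order
}

int main() {
    for (float v : top_k({0.2f, 0.9f, 0.1f, 0.7f, 0.4f, 0.8f}, 3)) {
        std::printf("%.1f\n", v); // 0.7, 0.9, 0.8 in some heap order
    }
}
```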
@@ -7650,18 +8179,21 @@ static void llama_convert_tensor_internal(
         return;
     }

-
-
+    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size_bytes = ggml_type_size(tensor->type);

     GGML_ASSERT(nelements % block_size == 0);
-
-
-
+    size_t nblocks = nelements / block_size;
+    size_t blocks_per_thread = nblocks / nthread;
+    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    size_t in_buff_offs = 0;
+    size_t out_buff_offs = 0;

-    for (
-
-
-
+    for (int tnum = 0; tnum < nthread; tnum++) {
+        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

         auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
             if (typ == GGML_TYPE_F16) {
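The new loop above hands each thread blocks_per_thread blocks and gives any remainder ("spare" blocks) to the last thread, so every block is converted exactly once. A standalone sketch of that partitioning (names are illustrative):

```cpp
#include <cstddef>
#include <cstdio>

struct thread_span {
    size_t first_block; // index of the first block this thread converts
    size_t n_blocks;    // number of blocks this thread converts
};

static thread_span blocks_for_thread(size_t nblocks, int nthread, int tnum) {
    const size_t per_thread = nblocks / nthread;
    const size_t spare      = nblocks - per_thread * nthread;
    thread_span s;
    s.first_block = per_thread * tnum;
    s.n_blocks    = per_thread + (tnum == nthread - 1 ? spare : 0);
    return s;
}

int main() {
    // 10 blocks over 4 threads -> spans of 2, 2, 2, 4 blocks
    for (int t = 0; t < 4; ++t) {
        const thread_span s = blocks_for_thread(10, 4, t);
        std::printf("thread %d: blocks [%zu, %zu)\n", t, s.first_block, s.first_block + s.n_blocks);
    }
}
```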
@@ -7678,11 +8210,9 @@ static void llama_convert_tensor_internal(
     workers.clear();
 }

-static ggml_type get_k_quant_type(
-    quantize_state_internal & qs,
-    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
-) {
+static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
+
     // TODO: avoid hardcoded tensor names - use the TN_* constants
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);
@@ -7716,7 +8246,18 @@ static ggml_type get_k_quant_type(
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
         ++qs.i_attention_wv;
+    } else if (name.find("attn_k.weight") != std::string::npos) {
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -7831,7 +8372,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif

-    llama_model_loader ml(fname_inp, use_mmap);
+    llama_model_loader ml(fname_inp, use_mmap, NULL);
     if (ml.use_mmap) {
         ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
     }
@@ -7925,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

         // quantize only 2D tensors
-        quantize &= (tensor
+        quantize &= (ggml_n_dims(tensor) == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;

+        // do not quantize expert gating tensors
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
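Collected in one place, the tensor-selection predicate that the quantization loop above builds up now reads roughly as follows; this is a paraphrase for clarity, with the n_dims and flag parameters standing in for the real tensor and params fields:

```cpp
#include <cstdio>
#include <string>

// paraphrase of the quantize-selection logic above; parameters are stand-ins
// for the real ggml tensor fields and llama_model_quantize_params flags
static bool should_quantize(const std::string & name, int n_dims,
                            bool quantize_output_tensor, bool only_copy) {
    bool quantize = name.size() >= 6 && name.rfind("weight") == name.size() - 6; // ends with 'weight'?
    quantize &= (n_dims == 2);                                                   // quantize only 2D tensors
    quantize &= quantize_output_tensor || name != "output.weight";
    quantize &= !only_copy;
    quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;           // skip expert gating tensors
    return quantize;
}

int main() {
    std::printf("%d\n", should_quantize("blk.0.ffn_up.weight",       2, true,  false)); // 1
    std::printf("%d\n", should_quantize("blk.0.ffn_gate_inp.weight", 2, true,  false)); // 0
    std::printf("%d\n", should_quantize("output.weight",             2, false, false)); // 0
}
```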
@@ -8127,7 +8671,7 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));

         size_t ctx_size;
         size_t mmapped_size;
@@ -8355,6 +8899,7 @@ struct llama_model_params llama_model_default_params() {
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
@@ -8382,10 +8927,12 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
         /*.yarn_orig_ctx               =*/ 0,
+        /*.type_k                      =*/ GGML_TYPE_F16,
+        /*.type_v                      =*/ GGML_TYPE_F16,
         /*.mul_mat_q                   =*/ true,
-        /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.embedding                   =*/ false,
+        /*.offload_kqv                 =*/ true,
     };

     return result;
@@ -8502,6 +9049,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.mul_mat_q      = params.mul_mat_q;
+    cparams.offload_kqv    = params.offload_kqv;

     cparams.n_ctx          = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8535,19 +9083,36 @@ struct llama_context * llama_new_context_with_model(
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;

-    ggml_type
+    const ggml_type type_k = params.type_k;
+    const ggml_type type_v = params.type_v;
+
+    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);

     // reserve memory for context buffers
     if (!hparams.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self,
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
            LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }

         {
-
-
+            size_t memory_size_k = 0;
+            size_t memory_size_v = 0;
+
+            for (auto & k : ctx->kv_self.k_l) {
+                memory_size_k += ggml_nbytes(k);
+            }
+
+            for (auto & v : ctx->kv_self.v_l) {
+                memory_size_v += ggml_nbytes(v);
+            }
+
+            LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }

         // resized during inference
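To put the new "KV self size" log line in perspective: the cache holds one K and one V tensor per layer, each roughly n_ctx * n_embd_gqa elements, so the footprint scales directly with the element sizes of type_k and type_v (2 bytes for F16). A back-of-the-envelope sketch with illustrative, not model-derived, dimensions:

```cpp
#include <cstdio>

int main() {
    const double n_layer    = 32;   // illustrative values only
    const double n_ctx      = 4096;
    const double n_embd_gqa = 4096;
    const double bytes_k    = 2.0;  // GGML_TYPE_F16
    const double bytes_v    = 2.0;  // GGML_TYPE_F16

    const double k_mib = n_layer * n_ctx * n_embd_gqa * bytes_k / (1024.0 * 1024.0);
    const double v_mib = n_layer * n_ctx * n_embd_gqa * bytes_v / (1024.0 * 1024.0);

    // for these numbers: K = 1024 MiB, V = 1024 MiB, total = 2048 MiB
    std::printf("KV self size = %7.2f MiB, K: %7.2f MiB, V: %7.2f MiB\n",
                k_mib + v_mib, k_mib, v_mib);
}
```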
@@ -8618,8 +9183,12 @@ struct llama_context * llama_new_context_with_model(
         }

         size_t kv_vram_size = 0;
-
-
+        for (auto & k : ctx->kv_self.k_l) {
+            add_tensor(k, kv_vram_size);
+        }
+        for (auto & v : ctx->kv_self.v_l) {
+            add_tensor(v, kv_vram_size);
+        }

         size_t ctx_vram_size = alloc_size + kv_vram_size;
         size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9089,37 +9658,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(&kv_used, sizeof(kv_used));

         if (kv_buf_size) {
-            const size_t elt_size = ggml_element_size(kv_self.
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-
-            std::vector<uint8_t
-            kout3d->data = kout3d_data.data();
+            std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
+            std::vector<std::vector<uint8_t>> vout2d_data(n_layer);

-
-
-
+            for (int il = 0; il < (int) n_layer; ++il) {
+                ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kout2d_data[il].resize(ggml_nbytes(kout2d));
+                kout2d->data = kout2d_data[il].data();

-
-
-
+                ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vout2d_data[il].resize(ggml_nbytes(vout2d));
+                vout2d->data = vout2d_data[il].data();

-
-
-
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                    n_embd, kv_head,
+                    elt_size*n_embd, 0);
+
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                    kv_head, n_embd,
+                    elt_size*n_ctx, 0);
+
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+            }

-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

             ggml_free(cpy_ctx);

-            // our data is now in the
+            // our data is now in the kout2d_data and vout2d_data buffers
             // write them to file
-
-
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
+                data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+            }
         }

         for (uint32_t i = 0; i < kv_size; ++i) {
@@ -9219,29 +9796,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);

-            const size_t elt_size = ggml_element_size(kv_self.
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-
-
-
+            for (int il = 0; il < n_layer; ++il) {
+                ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kin2d->data = (void *) inp;
+                inp += ggml_nbytes(kin2d);
+
+                ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vin2d->data = (void *) inp;
+                inp += ggml_nbytes(vin2d);

-
-
-
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                    n_embd, kv_head,
+                    elt_size*n_embd, 0);

-
-
-
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                    kv_head, n_embd,
+                    elt_size*n_ctx, 0);

-
-
-
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+            }

-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

             ggml_free(cpy_ctx);