llama_cpp 0.9.5 → 0.10.1
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -74,6 +74,7 @@
 #include <set>
 #include <sstream>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>

 #if defined(_MSC_VER)
@@ -90,7 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif

-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES   8192
+#define LLAMA_MAX_EXPERTS 8

 //
 // logging
@@ -192,6 +194,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
+    LLM_ARCH_QWEN,
     LLM_ARCH_UNKNOWN,
 };

@@ -208,6 +211,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,   "refact"   },
     { LLM_ARCH_BLOOM,    "bloom"    },
     { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN,     "qwen"     },
 };

 enum llm_kv {
@@ -228,6 +232,8 @@ enum llm_kv {
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
+    LLM_KV_EXPERT_COUNT,
+    LLM_KV_EXPERT_USED_COUNT,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -278,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_FEED_FORWARD_LENGTH,   "%s.feed_forward_length"   },
     { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
     { LLM_KV_TENSOR_DATA_LAYOUT,    "%s.tensor_data_layout"    },
+    { LLM_KV_EXPERT_COUNT,          "%s.expert_count"          },
+    { LLM_KV_EXPERT_USED_COUNT,     "%s.expert_used_count"     },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -335,10 +343,14 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
-
+    LLM_TENSOR_FFN_DOWN_EXP,
+    LLM_TENSOR_FFN_GATE_EXP,
+    LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
 };
@@ -357,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
         { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
         { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+        { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
         { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
         { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
         { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
         { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_EXP,  "blk.%d.ffn_gate.%d" },
+        { LLM_TENSOR_FFN_DOWN_EXP,  "blk.%d.ffn_down.%d" },
+        { LLM_TENSOR_FFN_UP_EXP,    "blk.%d.ffn_up.%d" },
     },
 },
 {
@@ -518,6 +534,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
+{
+    LLM_ARCH_QWEN,
+    {
+        { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT,      "output" },
+        { LLM_TENSOR_ROPE_FREQS,  "rope_freqs" },
+        { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+        { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+    },
+},

 {
     LLM_ARCH_UNKNOWN,
@@ -566,27 +598,16 @@ struct LLM_TN {
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
+
+    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+    }
 };

 //
 // gguf helpers
 //

-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
-    } \
-} while (0)
-
 static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE,   "none"   },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
```
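The new two-index `LLM_TN::operator()` overload above expands per-expert patterns such as `blk.%d.ffn_gate.%d` into one tensor name per (layer, expert) pair. A minimal standalone sketch of that expansion (the helper name here is hypothetical, not part of llama.cpp):

```cpp
// Standalone sketch: how a two-index pattern becomes a per-expert tensor name.
#include <cstdio>
#include <string>

static std::string expert_tensor_name(const char * pattern, int bid, int xid, const char * suffix) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, bid, xid);   // e.g. "blk.2.ffn_gate.5"
    return std::string(buf) + "." + suffix;               // e.g. "blk.2.ffn_gate.5.weight"
}

int main() {
    std::printf("%s\n", expert_tensor_name("blk.%d.ffn_gate.%d", 2, 5, "weight").c_str());
}
```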
```diff
@@ -620,7 +641,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
     }
 }

-static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);

     switch (type) {
@@ -1155,6 +1176,8 @@ struct llama_hparams {
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_ff;
+    uint32_t n_expert = 0;
+    uint32_t n_expert_used = 0;

     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1169,15 +1192,18 @@ struct llama_hparams {
     float f_max_alibi_bias;

     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only
-        if (this->n_vocab
-        if (this->n_ctx_train
-        if (this->n_embd
-        if (this->n_head
-        if (this->n_head_kv
-        if (this->n_layer
-        if (this->n_rot
-        if (this->n_ff
+        if (this->vocab_only    != other.vocab_only)    return true;
+        if (this->n_vocab       != other.n_vocab)       return true;
+        if (this->n_ctx_train   != other.n_ctx_train)   return true;
+        if (this->n_embd        != other.n_embd)        return true;
+        if (this->n_head        != other.n_head)        return true;
+        if (this->n_head_kv     != other.n_head_kv)     return true;
+        if (this->n_layer       != other.n_layer)       return true;
+        if (this->n_rot         != other.n_rot)         return true;
+        if (this->n_ff          != other.n_ff)          return true;
+        if (this->n_expert      != other.n_expert)      return true;
+        if (this->n_expert_used != other.n_expert_used) return true;
+
         if (this->rope_finetuned  != other.rope_finetuned)  return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

@@ -1222,6 +1248,7 @@ struct llama_cparams {
     float yarn_beta_slow;

     bool mul_mat_q;
+    bool offload_kqv;
 };

 struct llama_layer {
@@ -1243,6 +1270,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;

     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;

@@ -1255,6 +1285,12 @@ struct llama_layer {
     struct ggml_tensor * ffn_down; // w2
     struct ggml_tensor * ffn_up;   // w3

+    // ff MoE
+    struct ggml_tensor * ffn_gate_inp;
+    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b;   // b3
@@ -1287,8 +1323,8 @@ struct llama_kv_cache {

     std::vector<llama_kv_cell> cells;

-    struct ggml_tensor
-    struct ggml_tensor
+    std::vector<struct ggml_tensor *> k_l; // per layer
+    std::vector<struct ggml_tensor *> v_l;

     struct ggml_context * ctx = NULL;

@@ -1301,8 +1337,10 @@ struct llama_kv_cache {

 #ifdef GGML_USE_CUBLAS
         if (ggml_cublas_loaded()) {
-
-
+            for (size_t i = 0; i < k_l.size(); ++i) {
+                ggml_cuda_free_data(k_l[i]);
+                ggml_cuda_free_data(v_l[i]);
+            }
         }
 #endif
     }
@@ -1492,9 +1530,11 @@ struct llama_context {
 static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
-                         ggml_type
+                         ggml_type   ktype,
+                         ggml_type   vtype,
                          uint32_t    n_ctx,
-                         int         n_gpu_layers
+                         int         n_gpu_layers,
+                         bool        offload) {
     const uint32_t n_embd  = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;

@@ -1510,7 +1550,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    cache.buf.resize(
+    cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
@@ -1520,37 +1560,44 @@ static bool llama_kv_cache_init(

     cache.ctx = ggml_init(params);

+    size_t vram_kv_cache = 0;
+
     if (!cache.ctx) {
         LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }

-    cache.
-    cache.
-    ggml_set_name(cache.k, "cache_k");
-    ggml_set_name(cache.v, "cache_v");
+    cache.k_l.reserve(n_layer);
+    cache.v_l.reserve(n_layer);

-    (
+    const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);

-
-    if (ggml_cublas_loaded()) {
-        size_t vram_kv_cache = 0;
+    GGML_UNUSED(offload);

-
-
-
-
-
-
-
-
-
-
-
-
+    for (int i = 0; i < (int) n_layer; i++) {
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        cache.k_l.push_back(k);
+        cache.v_l.push_back(v);
+#ifdef GGML_USE_CUBLAS
+        if (i >= i_gpu_start) {
+            if (offload) {
+                ggml_cuda_assign_buffers_no_scratch(k);
+                vram_kv_cache += ggml_nbytes(k);
+                ggml_cuda_assign_buffers_no_scratch(v);
+                vram_kv_cache += ggml_nbytes(v);
+            }
         }
+#endif // GGML_USE_CUBLAS
     }
-
+
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+    }
+
+    GGML_UNUSED(n_gpu_layers);

     return true;
 }
```
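The KV cache now holds one K tensor and one V tensor per layer (`k_l`/`v_l`) instead of two monolithic tensors, so individual layers can be offloaded and typed independently. A back-of-envelope sketch of the memory involved, using assumed model dimensions rather than anything read from this diff:

```cpp
// Standalone sketch: approximate KV-cache size with per-layer F16 K/V tensors.
#include <cstdio>
#include <cstdint>

int main() {
    const uint64_t n_layer    = 32;
    const uint64_t n_embd_gqa = 4096;   // assumed embedding width of the KV heads
    const uint64_t n_ctx      = 4096;
    const uint64_t bytes_f16  = 2;

    // one K and one V tensor per layer, each n_embd_gqa * n_ctx elements
    const uint64_t kv_bytes = 2 * n_layer * n_embd_gqa * n_ctx * bytes_f16;
    std::printf("KV cache ~ %.2f MiB\n", kv_bytes / (1024.0 * 1024.0));   // ~2048 MiB
}
```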
```diff
@@ -1771,6 +1818,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     return buf;
 }

+namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    struct GKV_Base_Type {
+        static constexpr gguf_type gt = gt_;
+
+        static T getter(const gguf_context * ctx, const int kid) {
+            return gfun(ctx, kid);
+        }
+    };
+
+    template<typename T> struct GKV_Base;
+
+    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
+    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
+    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
+    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
+    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
+    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
+    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
+    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
+    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
+    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
+
+    template<> struct GKV_Base<std::string> {
+        static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+        static std::string getter(const gguf_context * ctx, const int kid) {
+            return gguf_get_val_str(ctx, kid);
+        }
+    };
+
+    struct ArrayInfo{
+        const gguf_type gt;
+        const size_t length;
+        const void * data;
+    };
+
+    template<> struct GKV_Base<ArrayInfo> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            return ArrayInfo {
+                gguf_get_arr_type(ctx, k),
+                size_t(gguf_get_arr_n(ctx, k)),
+                gguf_get_arr_data(ctx, k),
+            };
+        }
+    };
+
+    template<typename T>
+    class GKV: public GKV_Base<T> {
+        GKV() = delete;
+
+        public:
+        static T get_kv(const gguf_context * ctx, const int k) {
+            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+            if (kt != GKV::gt) {
+                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+            }
+            return GKV::getter(ctx, k);
+        }
+
+        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+            switch (ty) {
+                case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            }
+            return "unknown";
+        }
+
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
+            if (!override) { return false; }
+            if (override->tag == expected_type) {
+                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+                    __func__, override_type_to_str(override->tag), override->key);
+                switch (override->tag) {
+                    case LLAMA_KV_OVERRIDE_BOOL:  {
+                        printf("%s\n", override->bool_value ? "true" : "false");
+                    } break;
+                    case LLAMA_KV_OVERRIDE_INT:   {
+                        printf("%" PRId64 "\n", override->int_value);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_FLOAT: {
+                        printf("%.6f\n", override->float_value);
+                    } break;
+                    default:
+                        // Shouldn't be possible to end up here, but just in case...
+                        throw std::runtime_error(
+                            format("Unsupported attempt to override %s type for metadata key %s\n",
+                                override_type_to_str(override->tag), override->key));
+                }
+                return true;
+            }
+            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
+                __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+                target = override->bool_value;
+                return true;
+            }
+            return true;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+                target = override->int_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+                target = override->float_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            (void)target;
+            (void)override;
+            if (!override) { return false; }
+            // Currently, we should never end up here so it would be a bug if we do.
+            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
+                override ? override->key : "NULL"));
+        }
+
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
+            if (try_override<T>(target, override)) {
+                return true;
+            }
+            if (k < 0) { return false; }
+            target = get_kv(ctx, k);
+            return true;
+        }
+
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, gguf_find_key(ctx, key), target, override);
+        }
+
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, key.c_str(), target, override);
+        }
+    };
+}
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
```
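The `GGUFMeta` namespace above binds each C++ type to a GGUF type tag and a getter, so one templated `get_kv<T>()` can type-check a key before reading it. A minimal standalone sketch of that trait-based dispatch pattern, with a stand-in key/value store instead of the real gguf API:

```cpp
// Sketch of type-tagged lookup: a primary template specialized per type.
#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>
#include <variant>

enum class tag { INT, FLOAT, STR };
using value = std::variant<int64_t, double, std::string>;
static std::map<std::string, std::pair<tag, value>> store;   // stand-in for a gguf context

template <typename T> struct base;                            // specialized per supported type
template <> struct base<int64_t>     { static constexpr tag t = tag::INT;   };
template <> struct base<double>      { static constexpr tag t = tag::FLOAT; };
template <> struct base<std::string> { static constexpr tag t = tag::STR;   };

template <typename T>
T get(const std::string & key) {
    const auto it = store.find(key);
    if (it == store.end())              throw std::runtime_error("key not found: " + key);
    if (it->second.first != base<T>::t) throw std::runtime_error("wrong type for key: " + key);
    return std::get<T>(it->second.second);
}

int main() {
    store["llama.block_count"] = {tag::INT, value{int64_t(32)}};
    return int(get<int64_t>("llama.block_count")) == 32 ? 0 : 1;
}
```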
```diff
@@ -1786,21 +1996,34 @@ struct llama_model_loader {
     llama_fver fver;

     std::unique_ptr<llama_mmap> mapping;
+    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

     struct gguf_context * ctx_gguf = NULL;
     struct ggml_context * ctx_meta = NULL;

-
+    std::string arch_name;
+    LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_meta,
         };

+        if (param_overrides_p != nullptr) {
+            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+                kv_overrides.insert({std::string(p->key), *p});
+            }
+        }
+
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
         if (!ctx_gguf) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }

+        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);

@@ -1868,6 +2091,7 @@ struct llama_model_loader {
             }
         }

+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name           = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
@@ -1913,19 +2137,59 @@ struct llama_model_loader {
         }
     }

-
-
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const std::string & key, T & result, const bool required = true) {
+        const int kid = gguf_find_key(ctx_gguf, key.c_str());

-
-
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
+
+
+        result = arr_info.length;
+        return true;
+    }
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_arr_n(llm_kv(kid), result, required);
+    }
+
+    template<typename T>
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
+
+        const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);

+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
+    bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_key(llm_kv(kid), result, required);
+    }
+
+    std::string get_arch_name() const {
         return arch_name;
     }

     enum llm_arch get_arch() const {
-
-
-        return llm_arch_from_string(arch_name);
+        return llm_kv.arch;
     }

     const char * get_tensor_name(int i) const {
```
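`get_key` above resolves a metadata value in a fixed order: a caller-supplied override wins, otherwise the value stored in the GGUF file, otherwise the caller's default (and an exception if the key was required). A small sketch of that precedence, using illustrative names rather than the llama.cpp API:

```cpp
// Sketch of override-then-file-then-default lookup precedence.
#include <map>
#include <string>

template <typename T>
bool get_with_override(const std::map<std::string, T> & overrides,
                       const std::map<std::string, T> & file_kv,
                       const std::string & key, T & result) {
    if (auto it = overrides.find(key); it != overrides.end()) { result = it->second; return true; }
    if (auto it = file_kv.find(key);   it != file_kv.end())   { result = it->second; return true; }
    return false; // caller keeps its default, or throws if the key was required
}

int main() {
    std::map<std::string, int> overrides = {{"llama.expert_used_count", 2}};
    std::map<std::string, int> file_kv   = {{"llama.expert_used_count", 4}};
    int n_expert_used = 0;
    get_with_override(overrides, file_kv, std::string("llama.expert_used_count"), n_expert_used);
    return n_expert_used == 2 ? 0 : 1;   // the override wins
}
```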
```diff
@@ -1965,10 +2229,13 @@ struct llama_model_loader {
         return tensor;
     }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

         if (cur == NULL) {
+            if (!required) {
+                return NULL;
+            }
             throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
         }

@@ -2172,11 +2439,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(model.arch);
-
     auto & hparams = model.hparams;
+    const gguf_context * ctx = ml.ctx_gguf;

     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2190,42 +2454,51 @@ static void llm_load_hparams(
     }

     // get general kv
-
+    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

     // get hparams kv
-
-
-
-
-
-
+    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab);
+    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
+    ml.get_key  (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
+    ml.get_key  (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
+
+    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+    if (hparams.n_expert > 0) {
+        GGML_ASSERT(hparams.n_expert_used > 0);
+    } else {
+        GGML_ASSERT(hparams.n_expert_used == 0);
+    }

     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
-
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);

-
-
-
+    bool rope_finetuned = false;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;

     hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-
-    kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);

     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
-
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

     std::string rope_scaling("linear");
-
+    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);

     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
-
-
-
+    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+        // try the old key name
+        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

```
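A note on the last assignment above: the GGUF file stores a linear RoPE scaling *factor*, while the runtime works with a frequency *scale*, which is its inverse (with 0 treated as "unset"). A tiny sketch of that conversion:

```cpp
// Sketch of the factor -> frequency-scale conversion (values are illustrative).
#include <cstdio>

int main() {
    const float ropescale  = 4.0f;                                        // e.g. 4x context extension
    const float freq_scale = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;   // 0 means "not set"
    std::printf("freq_scale = %.2f\n", freq_scale);                       // 0.25
}
```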
```diff
@@ -2233,7 +2506,7 @@ static void llm_load_hparams(
     {
         hparams.n_rot = hparams.n_embd / hparams.n_head;

-
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

         if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
             if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2248,7 +2521,7 @@ static void llm_load_hparams(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
                     case 26: model.type = e_model::MODEL_3B; break;
@@ -2262,7 +2535,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_FALCON:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2272,7 +2545,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
@@ -2281,7 +2554,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STARCODER:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 36: model.type = e_model::MODEL_3B; break;
@@ -2292,7 +2565,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PERSIMMON:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 36: model.type = e_model::MODEL_8B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2300,7 +2573,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_REFACT:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2308,7 +2581,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BLOOM:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
@@ -2323,9 +2596,9 @@ static void llm_load_hparams(
             {
                 hparams.f_clamp_kqv = 0.0f;

-
-
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2335,13 +2608,23 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STABLELM:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                }
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;

         default: (void)0;
     }
```
```diff
@@ -2383,7 +2666,7 @@ static void llm_load_vocab(
     {
         std::string tokenizer_name;

-
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);

         if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2473,34 +2756,31 @@ static void llm_load_vocab(
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
-            int32_t & id = std::get<1>(it)
+            int32_t & id = std::get<1>(it);

-
-
-
-
-            if (
-                LLAMA_LOG_WARN("%s: bad special token: '%s' = %
-                    __func__, key.c_str(),
-
+            uint32_t new_id;
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
             }

         }

         // Handle add_bos_token and add_eos_token
-
-
-
-
-
-
-
-
-
-            ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-            vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-            if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-                LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        {
+            bool temp = true;
+
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+                vocab.special_add_bos = int(temp);
+            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+                vocab.special_add_eos = int(temp);
+            }
         }
     }

@@ -2511,7 +2791,7 @@ static void llm_load_vocab(
     // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
     // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
     // are special tokens.
-    // From testing, this appears to
+    // From testing, this appears to correlate 1:1 with special tokens.
     //

     // Counting special tokens and verifying in only one direction
@@ -2624,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
     LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
+    LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
```
```diff
@@ -2733,14 +3015,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
@@ -2777,17 +3052,55 @@ static void llm_load_tensors(
                     layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                     layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);

+                    // optional bias tensors
+                    layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend, false);
+                    layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend, false);
+                    layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend, false);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend, false);
+
                     layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

-                    layer.
-
-                    layer.
+                    layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
+
+                    if (layer.ffn_gate_inp == nullptr) {
+                        GGML_ASSERT(hparams.n_expert      == 0);
+                        GGML_ASSERT(hparams.n_expert_used == 0);
+
+                        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                    } else {
+                        GGML_ASSERT(hparams.n_expert      > 0);
+                        GGML_ASSERT(hparams.n_expert_used > 0);
+
+                        // MoE branch
+                        for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+                            layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff}, backend_split);
+                            layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd}, backend_split);
+                            layer.ffn_up_exp[x]   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff}, backend_split);
+                        }
+                    }

                     if (backend == GGML_BACKEND_GPU) {
                         vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq)
-                            ggml_nbytes(layer.wv)
-
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) +
+                            (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+                            (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+                            (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+                            (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+                            ggml_nbytes(layer.ffn_norm);
+
+                        if (layer.ffn_gate_inp == nullptr) {
+                            vram_weights +=
+                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                        } else {
+                            vram_weights += ggml_nbytes(layer.ffn_gate_inp);
+                            for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+                                vram_weights +=
+                                    ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
+                            }
+                        }
                     }
                 }
             } break;
```
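The MoE branch above replaces the three dense FFN weights with a router (`ffn_gate_inp`) plus a gate/down/up triple per expert. A back-of-envelope sketch of how many FFN tensors that means per layer (the expert count here is an assumption for illustration, matching `LLAMA_MAX_EXPERTS = 8`):

```cpp
// Sketch: FFN weight-tensor count per layer, dense vs. 8-expert MoE.
#include <cstdio>

int main() {
    const int n_expert          = 8;
    const int dense_ffn_tensors = 3;                 // ffn_gate, ffn_down, ffn_up
    const int moe_ffn_tensors   = 1 + 3 * n_expert;  // ffn_gate_inp + per-expert gate/down/up
    std::printf("dense: %d tensors, moe: %d tensors per layer\n", dense_ffn_tensors, moe_ffn_tensors);
}
```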
```diff
@@ -2799,14 +3112,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
@@ -2869,14 +3175,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
@@ -2946,14 +3245,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
@@ -3023,21 +3315,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    if (n_gpu_layers > int(n_layer + 1)) {
-                        LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
-                            __func__, n_layer + 1);
-                        throw std::runtime_error("Persimmon CUDA offload failed");
-                    }
-#endif
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
@@ -3096,14 +3374,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
@@ -3174,14 +3445,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
@@ -3241,14 +3505,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;

                 if (n_gpu_layers > int(n_layer)) {
-
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm   = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
```
```diff
@@ -3305,6 +3562,64 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        backend_norm   = llama_backend_offload;
+                        backend_output = llama_backend_offload_split;
+                    } else {
+                        backend_norm   = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff / 2;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;       // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd * 3},         backend);
+                    layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                    layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv)     + ggml_nbytes(layer.bqkv)     +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                            ggml_nbytes(layer.ffn_down)  + ggml_nbytes(layer.ffn_up);
+                    }
+                }
+            } break;

         default:
             throw std::runtime_error("unknown architecture");
```
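The Qwen loader above uses a single fused `attn_qkv` projection of width `3 * n_embd` instead of separate Q/K/V matrices. Under that assumed layout, Q, K and V are contiguous slices of each output row at offsets 0, `n_embd` and `2 * n_embd`; a tiny sketch of the offsets:

```cpp
// Sketch (assumed fused-QKV layout, illustrative dimensions only).
#include <cstdio>

int main() {
    const int n_embd = 4096;
    const int q_off = 0, k_off = n_embd, v_off = 2 * n_embd;
    std::printf("qkv row width = %d (q@%d, k@%d, v@%d)\n", 3 * n_embd, q_off, k_off, v_off);
}
```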
```diff
@@ -3331,8 +3646,8 @@ static void llm_load_tensors(
     }

 #ifdef GGML_USE_CUBLAS
-    const int max_backend_supported_layers = hparams.n_layer +
-    const int max_offloadable_layers       = hparams.n_layer +
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers       = hparams.n_layer + 1;
 #elif GGML_USE_CLBLAST
     const int max_backend_supported_layers = hparams.n_layer + 1;
     const int max_offloadable_layers       = hparams.n_layer + 1;
@@ -3373,7 +3688,7 @@ static void llm_load_tensors(

 static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap);
+        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

         model.hparams.vocab_only = params.vocab_only;

@@ -3500,11 +3815,11 @@ static void llm_build_k_shift(
         struct ggml_tensor * tmp =
             // we rotate only the first n_rot dimensions
             ggml_rope_custom_inplace(ctx,
-                    ggml_view_3d(ctx, kv.
+                    ggml_view_3d(ctx, kv.k_l[il],
                         n_embd_head, n_head_kv, n_ctx,
-
-
-
+                        ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                        ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                        0),
                     K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
         cb(tmp, "K_shifted", il);
```
```diff
@@ -3531,13 +3846,13 @@ static void llm_build_kv_store(
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
     cb(v_cur_t, "v_cur_t", il);

-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.
-            (
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
+            (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);

-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.
-            (
-            (
+    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+            (   n_ctx)*ggml_element_size(kv.v_l[il]),
+            (kv_head)*ggml_element_size(kv.v_l[il]));
     cb(v_cache_view, "v_cache_view", il);

     // important: storing RoPE-ed version of K in the KV cache!
@@ -3689,11 +4004,11 @@ static struct ggml_tensor * llm_build_kqv(
     cb(q, "q", il);

     struct ggml_tensor * k =
-        ggml_view_3d(ctx, kv.
+        ggml_view_3d(ctx, kv.k_l[il],
                 n_embd_head, n_kv, n_head_kv,
-
-
-
+                ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                0);
     cb(k, "k", il);

     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -3724,11 +4039,11 @@ static struct ggml_tensor * llm_build_kqv(

     // split cached v into n_head heads
     struct ggml_tensor * v =
-        ggml_view_3d(ctx, kv.
+        ggml_view_3d(ctx, kv.v_l[il],
                 n_kv, n_embd_head, n_head_kv,
-                ggml_element_size(kv.
-                ggml_element_size(kv.
-
+                ggml_element_size(kv.v_l[il])*n_ctx,
+                ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+                0);
     cb(v, "v", il);

     struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
```
```diff
@@ -3766,6 +4081,8 @@ struct llm_build_context {
     const int64_t n_head_kv;
     const int64_t n_embd_head;
     const int64_t n_embd_gqa;
+    const int64_t n_expert;
+    const int64_t n_expert_used;

     const float freq_base;
     const float freq_scale;
@@ -3807,6 +4124,8 @@ struct llm_build_context {
         n_head_kv        (hparams.n_head_kv),
         n_embd_head      (hparams.n_embd_head()),
         n_embd_gqa       (hparams.n_embd_gqa()),
+        n_expert         (hparams.n_expert),
+        n_expert_used    (hparams.n_expert_used),
         freq_base        (cparams.rope_freq_base),
         freq_scale       (cparams.rope_freq_scale),
         ext_factor       (cparams.yarn_ext_factor),
@@ -3886,12 +4205,24 @@ struct llm_build_context {
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
             cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }

             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
             cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }

             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3910,7 +4241,7 @@ struct llm_build_context {
             llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

             cur = llm_build_kqv(ctx0, hparams, kv_self,
-                    model.layers[il].wo,
+                    model.layers[il].wo, model.layers[il].bo,
                     Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
             cb(cur, "kqv_out", il);
         }
@@ -3919,7 +4250,7 @@ struct llm_build_context {
         cb(ffn_inp, "ffn_inp", il);

         // feed-forward network
-        {
+        if (model.layers[il].ffn_gate_inp == nullptr) {
             cur = llm_build_norm(ctx0, ffn_inp, hparams,
                     model.layers[il].ffn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
```
@@ -3931,6 +4262,69 @@ struct llm_build_context {
  model.layers[il].ffn_down, NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+ cb(logits, "ffn_moe_logits", il);
+
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+ cb(probs, "ffn_moe_probs", il);
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+ ggml_tensor * weights = ggml_get_rows(ctx0,
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+ cb(weights, "ffn_moe_weights", il);
+
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ // compute expert outputs
+ ggml_tensor * moe_out = nullptr;
+
+ for (int i = 0; i < n_expert_used; ++i) {
+ ggml_tensor * cur_expert;
+
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+ cb(cur_up, "ffn_moe_up", il);
+
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+ cb(cur_gate, "ffn_moe_gate", il);
+
+ cur_gate = ggml_silu(ctx0, cur_gate);
+ cb(cur_gate, "ffn_moe_silu", il);
+
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_gate_par", il);
+
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_down", il);
+
+ cur_expert = ggml_mul(ctx0, cur_expert,
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+ cb(cur_expert, "ffn_moe_weighted", il);
+
+ if (i == 0) {
+ moe_out = cur_expert;
+ } else {
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
+ cb(moe_out, "ffn_moe_out", il);
+ }
+ }
+
+ cur = moe_out;
  }

  cur = ggml_add(ctx0, cur, ffn_inp);
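The new MoE branch encodes the usual mixture-of-experts routing: softmax over the router logits, top-k expert selection, re-normalized weights, then a weighted sum of the selected experts' FFN outputs. Here is a small self-contained numeric sketch of that math on plain vectors; every name below is illustrative only and not taken from the diff:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        // router logits for one token over 4 hypothetical experts
        std::vector<float> logits = {1.0f, 3.0f, 0.5f, 2.0f};
        const int n_expert_used = 2; // "top-k" experts per token

        // softmax -> probs (ffn_moe_probs)
        const float mx = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(logits.size());
        float sum = 0.0f;
        for (size_t e = 0; e < logits.size(); ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (float & p : probs) p /= sum;

        // argsort by probability and keep the top n_expert_used (ffn_moe_argsort / top_k)
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected weights so they sum to 1 (ffn_moe_weights_norm)
        const float wsum = probs[idx[0]] + probs[idx[1]];

        // weighted sum of per-expert outputs (each "expert" here just scales the input)
        const float x = 1.0f;
        float out = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            const int   e = idx[i];
            const float w = probs[e] / wsum;
            const float expert_out = (e + 1) * x; // stand-in for the expert FFN
            out += w * expert_out;
        }
        std::printf("selected experts: %d %d, out = %.3f\n", idx[0], idx[1], out);
        return 0;
    }

The graph version does the same thing batched over all tokens, using ggml_top_k, ggml_get_rows, ggml_sum_rows and ggml_div for the weights, and ggml_mul_mat_id to pick the i-th selected expert's up/gate/down matrices.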
@@ -4308,6 +4702,7 @@ struct llm_build_context {
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  cb(inpL, "imp_embd", -1);

+ // inp_pos - contains the positions
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);

@@ -4315,6 +4710,7 @@ struct llm_build_context {
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
  cb(KQ_scale, "KQ_scale", -1);

+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);

@@ -4903,6 +5299,121 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_qwen() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ // using mode = 2 for neox mode
+ Qcur = ggml_rope_custom(
+ ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward forward
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  //
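The new build_qwen() graph projects Q, K and V with a single fused wqkv matrix and then slices the result into three views at float offsets 0, n_embd and 2*n_embd. A tiny standalone sketch of that slicing on a flat buffer (names are illustrative, not from the diff):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd = 4;
        // one token's fused QKV row: [ q0..q3 | k0..k3 | v0..v3 ]
        std::vector<float> qkv(3 * n_embd);
        for (int i = 0; i < 3 * n_embd; ++i) qkv[i] = (float) i;

        // the graph takes views at byte offsets 0, n_embd*sizeof(float), 2*n_embd*sizeof(float);
        // on a flat buffer that is just pointer arithmetic
        const float * q = qkv.data() + 0 * n_embd;
        const float * k = qkv.data() + 1 * n_embd;
        const float * v = qkv.data() + 2 * n_embd;

        std::printf("q[0]=%.0f k[0]=%.0f v[0]=%.0f\n", q[0], k[0], v[0]); // 0, 4, 8
        return 0;
    }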
@@ -4913,8 +5424,8 @@ struct llm_build_context {
  enum llm_offload_func_e {
  OFFLOAD_FUNC_NOP,
  OFFLOAD_FUNC,
-
-
+ OFFLOAD_FUNC_FRC, // force offload
+ OFFLOAD_FUNC_KQV,
  OFFLOAD_FUNC_NR,
  OFFLOAD_FUNC_EMB,
  OFFLOAD_FUNC_OUT,
@@ -5000,11 +5511,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
  { "pos_embd", OFFLOAD_FUNC_NR },

- { "inp_pos",
- { "KQ_scale",
- { "KQ_mask",
- { "K_shift",
-
+ { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
+ { "KQ_scale", OFFLOAD_FUNC_FRC },
+ { "KQ_mask", OFFLOAD_FUNC_FRC },
+ { "K_shift", OFFLOAD_FUNC_FRC },
+
+ { "K_shifted", OFFLOAD_FUNC },

  { "inp_norm", OFFLOAD_FUNC_NR },
  { "inp_norm_w", OFFLOAD_FUNC_NR },
@@ -5017,38 +5529,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "attn_norm", OFFLOAD_FUNC },
  { "attn_norm_2", OFFLOAD_FUNC },

- { "wqkv",
- { "bqkv",
- { "wqkv_clamped",
-
- { "tmpk",
- { "tmpq",
- { "tmpv",
- { "Kcur",
- { "Qcur",
- { "Vcur",
-
- { "krot",
- { "qrot",
- { "kpass",
- { "qpass",
- { "krotated",
- { "qrotated",
-
- { "q",
- { "k",
- { "kq",
- { "kq_scaled",
- { "kq_scaled_alibi",
- { "kq_masked",
- { "kq_soft_max",
- { "kq_soft_max_ext",
- { "v",
- { "kqv",
- { "kqv_merged",
- { "kqv_merged_cont",
- { "kqv_wo",
- { "kqv_out",
+ { "wqkv", OFFLOAD_FUNC_KQV },
+ { "bqkv", OFFLOAD_FUNC_KQV },
+ { "wqkv_clamped", OFFLOAD_FUNC_KQV },
+
+ { "tmpk", OFFLOAD_FUNC_KQV },
+ { "tmpq", OFFLOAD_FUNC_KQV },
+ { "tmpv", OFFLOAD_FUNC_KQV },
+ { "Kcur", OFFLOAD_FUNC_KQV },
+ { "Qcur", OFFLOAD_FUNC_KQV },
+ { "Vcur", OFFLOAD_FUNC_KQV },
+
+ { "krot", OFFLOAD_FUNC_KQV },
+ { "qrot", OFFLOAD_FUNC_KQV },
+ { "kpass", OFFLOAD_FUNC_KQV },
+ { "qpass", OFFLOAD_FUNC_KQV },
+ { "krotated", OFFLOAD_FUNC_KQV },
+ { "qrotated", OFFLOAD_FUNC_KQV },
+
+ { "q", OFFLOAD_FUNC_KQV },
+ { "k", OFFLOAD_FUNC_KQV },
+ { "kq", OFFLOAD_FUNC_KQV },
+ { "kq_scaled", OFFLOAD_FUNC_KQV },
+ { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
+ { "kq_masked", OFFLOAD_FUNC_KQV },
+ { "kq_soft_max", OFFLOAD_FUNC_KQV },
+ { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
+ { "v", OFFLOAD_FUNC_KQV },
+ { "kqv", OFFLOAD_FUNC_KQV },
+ { "kqv_merged", OFFLOAD_FUNC_KQV },
+ { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
+ { "kqv_wo", OFFLOAD_FUNC_KQV },
+ { "kqv_out", OFFLOAD_FUNC_KQV },

  { "ffn_inp", OFFLOAD_FUNC },
  { "ffn_norm", OFFLOAD_FUNC },
@@ -5067,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "ffn_relu", OFFLOAD_FUNC },
  { "ffn_sqr(relu)", OFFLOAD_FUNC },

+ { "ffn_moe_logits", OFFLOAD_FUNC },
+ { "ffn_moe_probs", OFFLOAD_FUNC },
+ { "ffn_moe_argsort", OFFLOAD_FUNC },
+ { "ffn_moe_weights", OFFLOAD_FUNC },
+ { "ffn_moe_weights_sum", OFFLOAD_FUNC },
+ { "ffn_moe_weights_norm", OFFLOAD_FUNC },
+ { "ffn_moe_weighted", OFFLOAD_FUNC },
+ { "ffn_moe_up", OFFLOAD_FUNC },
+ { "ffn_moe_gate", OFFLOAD_FUNC },
+ { "ffn_moe_silu", OFFLOAD_FUNC },
+ { "ffn_moe_gate_par", OFFLOAD_FUNC },
+ { "ffn_moe_down", OFFLOAD_FUNC },
+ { "ffn_moe_out", OFFLOAD_FUNC },
+
  { "l_out", OFFLOAD_FUNC },

  { "result_norm", OFFLOAD_FUNC_EMB },
@@ -5240,15 +5766,15 @@ static struct ggml_cgraph * llama_build_graph(
  { OFFLOAD_FUNC_NOP, "CPU" },
  { OFFLOAD_FUNC_OUT, "CPU" },
  #ifdef GGML_USE_CUBLAS
- { OFFLOAD_FUNC, "GPU (CUDA)"
- {
- {
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR"
+ { OFFLOAD_FUNC, "GPU (CUDA)" },
+ { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
+ { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
+ { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
  { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
  #else
  { OFFLOAD_FUNC, "CPU" },
- {
- {
+ { OFFLOAD_FUNC_FRC, "CPU" },
+ { OFFLOAD_FUNC_KQV, "CPU" },
  { OFFLOAD_FUNC_NR, "CPU" },
  { OFFLOAD_FUNC_EMB, "CPU" },
  #endif // GGML_USE_CUBLAS
@@ -5281,18 +5807,23 @@ static struct ggml_cgraph * llama_build_graph(
  }
  }
  break;
- case
- if (
+ case OFFLOAD_FUNC_FRC:
+ if (!lctx.cparams.offload_kqv) {
  func_e = OFFLOAD_FUNC_NOP;
- }
-
-
- if (n_gpu_layers <= n_layer + 1) {
+ } break;
+ case OFFLOAD_FUNC_KQV:
+ if (!lctx.cparams.offload_kqv) {
  func_e = OFFLOAD_FUNC_NOP;
+ } else {
+ if (n_gpu_layers < n_layer) {
+ if (il < i_gpu_start) {
+ func_e = OFFLOAD_FUNC_NOP;
+ }
+ }
  }
  break;
- case
- if (n_gpu_layers <= n_layer +
+ case OFFLOAD_FUNC_NR:
+ if (n_gpu_layers <= n_layer + 0) {
  func_e = OFFLOAD_FUNC_NOP;
  }
  break;
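The new OFFLOAD_FUNC_FRC and OFFLOAD_FUNC_KQV cases decide per graph node whether it stays on the CPU, based on the offload_kqv flag, the number of offloaded layers, and the layer index. A condensed standalone sketch of that decision under the same assumptions (the function and enum below are illustrative, not the library's API):

    #include <cstdio>

    enum class Target { GPU, CPU };

    // mirrors the switch in the diff: FRC nodes follow offload_kqv directly,
    // KQV nodes additionally fall back to CPU for layers below i_gpu_start
    // when not all layers are offloaded
    static Target offload_kqv_node(bool offload_kqv, int n_gpu_layers, int n_layer,
                                   int il, int i_gpu_start) {
        if (!offload_kqv) {
            return Target::CPU;
        }
        if (n_gpu_layers < n_layer && il < i_gpu_start) {
            return Target::CPU;
        }
        return Target::GPU;
    }

    int main() {
        // 32-layer model, 16 layers offloaded starting at layer 16
        std::printf("layer  3 -> %s\n", offload_kqv_node(true, 16, 32,  3, 16) == Target::GPU ? "GPU" : "CPU");
        std::printf("layer 20 -> %s\n", offload_kqv_node(true, 16, 32, 20, 16) == Target::GPU ? "GPU" : "CPU");
        return 0;
    }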
@@ -5317,8 +5848,8 @@ static struct ggml_cgraph * llama_build_graph(
  case OFFLOAD_FUNC_NOP:
  case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
  case OFFLOAD_FUNC:
- case
- case
+ case OFFLOAD_FUNC_KQV:
+ case OFFLOAD_FUNC_FRC:
  case OFFLOAD_FUNC_NR:
  case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
  default: GGML_ASSERT(false);
@@ -5377,6 +5908,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_stablelm();
  } break;
+ case LLM_ARCH_QWEN:
+ {
+ result = llm.build_qwen();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -5454,7 +5989,7 @@ static int llama_decode_internal(
  const int64_t n_embd = hparams.n_embd;
  const int64_t n_vocab = hparams.n_vocab;

- // helpers for smoother batch API
+ // helpers for smoother batch API transition
  // after deprecating the llama_eval calls, these will be removed
  std::vector<llama_pos> pos;

@@ -5499,8 +6034,8 @@ static int llama_decode_internal(
  // a heuristic, to avoid attending the full cache if it is not yet utilized
  // after enough generations, the benefit from this heuristic disappears
  // if we start defragmenting the cache, the benefit from this will be more important
-
- kv_self.n =
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);

  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

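The new kv_self.n expression clamps the attended cache size: round the highest used cell up to a multiple of 32, never go below 32, and never above n_ctx. A worked standalone sketch of the same arithmetic (pad_up is assumed here to behave like the usual round-up-to-multiple GGML_PAD macro):

    #include <algorithm>
    #include <cstdio>

    // assumed equivalent of GGML_PAD(x, n): round x up to a multiple of n
    static int pad_up(int x, int n) { return ((x + n - 1) / n) * n; }

    static int kv_n(int n_ctx, int cell_max) {
        return std::min(n_ctx, std::max(32, pad_up(cell_max, 32)));
    }

    int main() {
        std::printf("%d\n", kv_n(4096, 0));    // 32   (empty cache still attends 32 cells)
        std::printf("%d\n", kv_n(4096, 100));  // 128  (100 rounded up to a multiple of 32)
        std::printf("%d\n", kv_n(4096, 5000)); // 4096 (clamped to the context size)
        return 0;
    }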
@@ -5551,7 +6086,7 @@ static int llama_decode_internal(
  n_threads = std::min(4, n_threads);
  }

- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer +
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
  if (ggml_cpu_has_cublas() && fully_offloaded) {
  n_threads = 1;
  }
@@ -6233,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<

  // loop over the text
  while (true) {
- // find the first
+ // find the first occurrence of a given special token in this fragment
  // passing offset argument only limit the "search area" but match coordinates
  // are still relative to the source full raw_text
  auto match = raw_text->find(special_token, raw_text_base_offset);

- // no
+ // no occurrences found, stop processing this fragment for a given special token
  if (match == std::string::npos) break;

  // check if match is within bounds of offset <-> length
@@ -6410,14 +6945,13 @@ struct llama_grammar_candidate {
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
- const
- size_t n_src,
+ const std::string & src,
  llama_partial_utf8 partial_start) {
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
- const char * pos = src;
+ const char * pos = src.c_str();
  std::vector<uint32_t> code_points;
  // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
- code_points.reserve(
+ code_points.reserve(src.size() + 1);
  uint32_t value = partial_start.value;
  int n_remain = partial_start.n_remain;

@@ -6468,13 +7002,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
  }

- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
- std::string src,
- llama_partial_utf8 partial_start
- ) {
- return decode_utf8(src.c_str(), src.size(), partial_start);
- }
-
  // returns true iff pos points to the end of one of the definitions of a rule
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
  switch (pos->type) {
@@ -7113,7 +7640,9 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  const llama_token eos = llama_token_eos(&ctx->model);

  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+ candidates_decoded.reserve(candidates->size);
  std::vector<llama_grammar_candidate> candidates_grammar;
+ candidates_grammar.reserve(candidates->size);

  for (size_t i = 0; i < candidates->size; ++i) {
  const llama_token id = candidates->data[i].id;
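The two reserve() calls above pre-allocate the candidate vectors so the per-token push_back loop never has to reallocate and copy. A small standalone illustration of the effect, using only the standard library (purely illustrative):

    #include <cstdio>
    #include <vector>

    int main() {
        const size_t n = 1000;

        std::vector<int> a;   // grows geometrically, reallocating several times
        std::vector<int> b;
        b.reserve(n);         // one allocation up front, as in the diff

        size_t a_reallocs = 0;
        for (size_t i = 0; i < n; ++i) {
            const int * old_data = a.data();
            a.push_back((int) i);
            if (a.data() != old_data) ++a_reallocs;
            b.push_back((int) i); // capacity never changes after reserve()
        }
        std::printf("reallocations without reserve: %zu, with reserve: 0\n", a_reallocs);
        return 0;
    }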
@@ -7443,7 +7972,7 @@ struct llama_beam_search_data {
  }

  // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The
+ // The repetitive patterns below reflect the 2 stages of heaps:
  // * Gather elements until the vector is full, then call std::make_heap() on it.
  // * If the heap is full and a new element is found that should be included, pop the
  // least element to the back(), replace it with the new, then push it into the heap.
@@ -7650,18 +8179,21 @@ static void llama_convert_tensor_internal(
  return;
  }

-
-
+ size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+ size_t block_size_bytes = ggml_type_size(tensor->type);

  GGML_ASSERT(nelements % block_size == 0);
-
-
-
+ size_t nblocks = nelements / block_size;
+ size_t blocks_per_thread = nblocks / nthread;
+ size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+ size_t in_buff_offs = 0;
+ size_t out_buff_offs = 0;

- for (
-
-
-
+ for (int tnum = 0; tnum < nthread; tnum++) {
+ size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+ size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+ size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
  if (typ == GGML_TYPE_F16) {
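The rewritten dequantization loop splits nblocks across nthread workers and gives any remainder ("spare") blocks to the last thread. A standalone sketch of that partitioning arithmetic, checking that the per-thread counts add back up (illustrative only):

    #include <cstdio>

    int main() {
        const size_t nblocks = 1000;
        const int    nthread = 7;

        const size_t blocks_per_thread = nblocks / nthread;                     // 142
        const size_t spare_blocks      = nblocks - blocks_per_thread * nthread; // 6

        size_t total = 0;
        for (int tnum = 0; tnum < nthread; tnum++) {
            // last thread picks up the spare blocks, exactly as in the diff
            const size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
            total += thr_blocks;
            std::printf("thread %d: %zu blocks\n", tnum, thr_blocks);
        }
        std::printf("total: %zu (expected %zu)\n", total, nblocks);
        return 0;
    }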
@@ -7678,11 +8210,9 @@ static void llama_convert_tensor_internal(
  workers.clear();
  }

- static ggml_type get_k_quant_type(
- quantize_state_internal & qs,
- ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
- ) {
+ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
  const std::string name = ggml_get_name(tensor);
+
  // TODO: avoid hardcoded tensor names - use the TN_* constants
  const llm_arch arch = qs.model.arch;
  const auto tn = LLM_TN(arch);
@@ -7716,7 +8246,18 @@ static ggml_type get_k_quant_type(
  // nearly negligible increase in model size by quantizing this tensor with more bits:
  if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
  }
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  ++qs.i_attention_wv;
+ } else if (name.find("attn_k.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -7831,7 +8372,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  constexpr bool use_mmap = false;
  #endif

- llama_model_loader ml(fname_inp, use_mmap);
+ llama_model_loader ml(fname_inp, use_mmap, NULL);
  if (ml.use_mmap) {
  ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
  }
@@ -7925,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

  // quantize only 2D tensors
- quantize &= (tensor
+ quantize &= (ggml_n_dims(tensor) == 2);
  quantize &= params->quantize_output_tensor || name != "output.weight";
  quantize &= !params->only_copy;

+ // do not quantize expert gating tensors
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
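The quantization gate above is a chain of `quantize &= ...` filters; the new line excludes the MoE router ("ffn_gate_inp") weights from quantization. A standalone sketch of a simplified subset of those name-based checks on plain strings (the helper below is illustrative, not the library's code, and omits the output-tensor and only_copy conditions):

    #include <cstdio>
    #include <string>

    // true if a tensor with this name and rank would pass the simplified filter:
    // name ends with "weight", tensor is 2-D, and it is not an expert-gating tensor
    static bool should_quantize(const std::string & name, int n_dims) {
        bool quantize = name.size() >= 6 && name.rfind("weight") == name.size() - 6;
        quantize &= (n_dims == 2);
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
        return quantize;
    }

    int main() {
        std::printf("%d\n", should_quantize("blk.0.ffn_up.weight", 2));       // 1
        std::printf("%d\n", should_quantize("blk.0.ffn_gate_inp.weight", 2)); // 0
        std::printf("%d\n", should_quantize("blk.0.attn_norm.bias", 1));      // 0
        return 0;
    }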
@@ -8127,7 +8671,7 @@ static int llama_apply_lora_from_file_internal(
  std::vector<uint8_t> base_buf;
  if (path_base_model) {
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));

  size_t ctx_size;
  size_t mmapped_size;
@@ -8355,6 +8899,7 @@ struct llama_model_params llama_model_default_params() {
  /*.tensor_split =*/ nullptr,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
+ /*.kv_overrides =*/ nullptr,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
@@ -8382,10 +8927,12 @@ struct llama_context_params llama_context_default_params() {
  /*.yarn_beta_fast =*/ 32.0f,
  /*.yarn_beta_slow =*/ 1.0f,
  /*.yarn_orig_ctx =*/ 0,
+ /*.type_k =*/ GGML_TYPE_F16,
+ /*.type_v =*/ GGML_TYPE_F16,
  /*.mul_mat_q =*/ true,
- /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.embedding =*/ false,
+ /*.offload_kqv =*/ true,
  };

  return result;
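From the C API side, the removed boolean f16_kv becomes two explicit cache types plus an offload toggle. A hedged usage sketch that only relies on the fields visible in this diff (it assumes compiling and linking against the gem's bundled llama.cpp headers):

    #include "llama.h"

    int main() {
        // start from the library defaults added in this version:
        // type_k = type_v = GGML_TYPE_F16, offload_kqv = true
        struct llama_context_params cparams = llama_context_default_params();

        // per this diff, the K and V caches can now be typed independently
        cparams.type_k = GGML_TYPE_F16;
        cparams.type_v = GGML_TYPE_F16;

        // and KQV-related work can be kept on the CPU if desired
        cparams.offload_kqv = false;

        (void) cparams; // pass to llama_new_context_with_model(model, cparams) as usual
        return 0;
    }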
@@ -8502,6 +9049,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.yarn_beta_fast = params.yarn_beta_fast;
  cparams.yarn_beta_slow = params.yarn_beta_slow;
  cparams.mul_mat_q = params.mul_mat_q;
+ cparams.offload_kqv = params.offload_kqv;

  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8535,19 +9083,36 @@ struct llama_context * llama_new_context_with_model(
  ctx->rng = std::mt19937(params.seed);
  ctx->logits_all = params.logits_all;

- ggml_type
+ const ggml_type type_k = params.type_k;
+ const ggml_type type_v = params.type_v;
+
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);

  // reserve memory for context buffers
  if (!hparams.vocab_only) {
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self,
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }

  {
-
-
+ size_t memory_size_k = 0;
+ size_t memory_size_v = 0;
+
+ for (auto & k : ctx->kv_self.k_l) {
+ memory_size_k += ggml_nbytes(k);
+ }
+
+ for (auto & v : ctx->kv_self.v_l) {
+ memory_size_v += ggml_nbytes(v);
+ }
+
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
  }

  // resized during inference
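The new log line reports the K and V cache sizes separately. As a rough sanity check, each side is approximately n_layer * n_ctx * n_embd_gqa elements times the per-element size of the chosen cache type; here is a back-of-the-envelope standalone sketch (the model numbers below are made up for illustration):

    #include <cstdio>

    int main() {
        // hypothetical 7B-class model
        const long long n_layer    = 32;
        const long long n_ctx      = 4096;
        const long long n_embd_gqa = 4096; // embedding size seen by the KV cache
        const double    f16_bytes  = 2.0;  // bytes per element for an F16 cache

        const double k_bytes = (double) n_layer * n_ctx * n_embd_gqa * f16_bytes;
        const double v_bytes = k_bytes;    // same shape for the V cache

        std::printf("K: %.2f MiB, V: %.2f MiB, total: %.2f MiB\n",
                    k_bytes / (1024.0 * 1024.0),
                    v_bytes / (1024.0 * 1024.0),
                    (k_bytes + v_bytes) / (1024.0 * 1024.0));
        return 0;
    }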
@@ -8618,8 +9183,12 @@ struct llama_context * llama_new_context_with_model(
  }

  size_t kv_vram_size = 0;
-
-
+ for (auto & k : ctx->kv_self.k_l) {
+ add_tensor(k, kv_vram_size);
+ }
+ for (auto & v : ctx->kv_self.v_l) {
+ add_tensor(v, kv_vram_size);
+ }

  size_t ctx_vram_size = alloc_size + kv_vram_size;
  size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9089,37 +9658,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  data_ctx->write(&kv_used, sizeof(kv_used));

  if (kv_buf_size) {
- const size_t elt_size = ggml_element_size(kv_self.
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-
- std::vector<uint8_t
- kout3d->data = kout3d_data.data();
+ std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
+ std::vector<std::vector<uint8_t>> vout2d_data(n_layer);

-
-
-
+ for (int il = 0; il < (int) n_layer; ++il) {
+ ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+ kout2d_data[il].resize(ggml_nbytes(kout2d));
+ kout2d->data = kout2d_data[il].data();

-
-
-
+ ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ vout2d_data[il].resize(ggml_nbytes(vout2d));
+ vout2d->data = vout2d_data[il].data();

-
-
-
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+ n_embd, kv_head,
+ elt_size*n_embd, 0);
+
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+ kv_head, n_embd,
+ elt_size*n_ctx, 0);
+
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+ }

- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);

- // our data is now in the
+ // our data is now in the kout2d_data and vout2d_data buffers
  // write them to file
-
-
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
+ data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+ }
  }

  for (uint32_t i = 0; i < kv_size; ++i) {
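The session-save path now serializes the cache per layer, writing each layer's K block followed by its V block; the load path in the next hunk reads them back in the same order by walking a moving input pointer. A minimal standalone sketch of that round-trip framing on raw byte buffers (sizes and names below are invented for illustration):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
        const int    n_layer = 3;
        const size_t k_bytes = 16, v_bytes = 8; // per-layer block sizes (made up)

        // "save": append k then v for every layer, in layer order
        std::vector<uint8_t> blob;
        for (int il = 0; il < n_layer; ++il) {
            std::vector<uint8_t> k(k_bytes, (uint8_t)(0x10 + il));
            std::vector<uint8_t> v(v_bytes, (uint8_t)(0x20 + il));
            blob.insert(blob.end(), k.begin(), k.end());
            blob.insert(blob.end(), v.begin(), v.end());
        }

        // "load": walk the same layout with a moving input pointer, like `inp` in the diff
        const uint8_t * inp = blob.data();
        for (int il = 0; il < n_layer; ++il) {
            std::vector<uint8_t> k(inp, inp + k_bytes); inp += k_bytes;
            std::vector<uint8_t> v(inp, inp + v_bytes); inp += v_bytes;
            assert(k[0] == 0x10 + il && v[0] == 0x20 + il);
        }
        assert(inp == blob.data() + blob.size());
        return 0;
    }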
@@ -9219,29 +9796,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  if (kv_buf_size) {
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);

- const size_t elt_size = ggml_element_size(kv_self.
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-
-
-
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+ kin2d->data = (void *) inp;
+ inp += ggml_nbytes(kin2d);
+
+ ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ vin2d->data = (void *) inp;
+ inp += ggml_nbytes(vin2d);

-
-
-
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+ n_embd, kv_head,
+ elt_size*n_embd, 0);

-
-
-
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+ kv_head, n_embd,
+ elt_size*n_ctx, 0);

-
-
-
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+ }

- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);