llama_cpp 0.10.0 → 0.10.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +691 -93
- data/ext/llama_cpp/src/ggml-metal.m +535 -54
- data/ext/llama_cpp/src/ggml-metal.metal +1497 -169
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +325 -159
- data/ext/llama_cpp/src/ggml.h +34 -13
- data/ext/llama_cpp/src/llama.cpp +195 -35
- data/ext/llama_cpp/src/llama.h +1 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -215,9 +215,9 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS 4
-#define GGML_MAX_PARAMS
+#define GGML_MAX_PARAMS 2048
 #define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_SRC
+#define GGML_MAX_SRC 10
 #define GGML_MAX_NAME 64
 #define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4
@@ -423,7 +423,9 @@ extern "C" {
 GGML_OP_POOL_1D,
 GGML_OP_POOL_2D,
 GGML_OP_UPSCALE, // nearest interpolate
+GGML_OP_PAD,
 GGML_OP_ARGSORT,
+GGML_OP_LEAKY_RELU,
 
 GGML_OP_FLASH_ATTN,
 GGML_OP_FLASH_FF,
@@ -463,7 +465,6 @@ extern "C" {
 GGML_UNARY_OP_GELU,
 GGML_UNARY_OP_GELU_QUICK,
 GGML_UNARY_OP_SILU,
-GGML_UNARY_OP_LEAKY,
 
 GGML_UNARY_OP_COUNT,
 };
@@ -501,7 +502,6 @@ extern "C" {
 
 struct ggml_backend_buffer * buffer;
 
-int n_dims;
 int64_t ne[GGML_MAX_DIMS]; // number of elements
 size_t nb[GGML_MAX_DIMS]; // stride in bytes:
 // nb[0] = ggml_type_size(type)
@@ -533,7 +533,7 @@ extern "C" {
 
 void * extra; // extra things e.g. for ggml-cuda.cu
 
-char padding[
+char padding[8];
 };
 
 static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -638,11 +638,14 @@ extern "C" {
 GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
 GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
 GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
-GGML_API int
-GGML_API size_t
-GGML_API
+GGML_API int ggml_blck_size(enum ggml_type type);
+GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+GGML_DEPRECATED(
+GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+"use ggml_row_size() instead");
 
 GGML_API const char * ggml_type_name(enum ggml_type type);
 GGML_API const char * ggml_op_name (enum ggml_op op);
@@ -661,6 +664,11 @@ extern "C" {
 GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
 GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
 GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
 GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -793,6 +801,9 @@ extern "C" {
 struct ggml_tensor * a,
 struct ggml_tensor * b);
 
+// dst = a
+// view(dst, nb1, nb2, nb3, offset) += b
+// return dst
 GGML_API struct ggml_tensor * ggml_acc(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
@@ -957,15 +968,14 @@ extern "C" {
 struct ggml_context * ctx,
 struct ggml_tensor * a);
 
-GGML_API struct ggml_tensor *
+GGML_API struct ggml_tensor * ggml_leaky_relu(
 struct ggml_context * ctx,
-struct ggml_tensor * a);
+struct ggml_tensor * a, float negative_slope, bool inplace);
 
 GGML_API struct ggml_tensor * ggml_relu_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a);
 
-// TODO: double-check this computation is correct
 GGML_API struct ggml_tensor * ggml_gelu(
 struct ggml_context * ctx,
 struct ggml_tensor * a);
@@ -1051,7 +1061,8 @@ extern "C" {
 // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
 GGML_API struct ggml_tensor * ggml_mul_mat_id(
 struct ggml_context * ctx,
-struct ggml_tensor * as[],
+struct ggml_tensor * const as[],
+int n_as,
 struct ggml_tensor * ids,
 int id,
 struct ggml_tensor * b);
@@ -1263,6 +1274,7 @@ extern "C" {
 struct ggml_context * ctx,
 struct ggml_tensor * a);
 
+// supports 3D: a->ne[2] == b->ne[1]
 GGML_API struct ggml_tensor * ggml_get_rows(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
@@ -1549,6 +1561,15 @@ extern "C" {
 struct ggml_tensor * a,
 int scale_factor);
 
+// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+GGML_API struct ggml_tensor * ggml_pad(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int p0,
+int p1,
+int p2,
+int p3);
+
 // sort rows
 enum ggml_sort_order {
 GGML_SORT_ASC,
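The ggml.h changes above drop the per-tensor n_dims field and deprecate the float-returning ggml_type_sizef() in favor of the integer helpers ggml_row_size() and ggml_n_dims(). A minimal migration sketch using only the declarations shown in this diff; the helper names kv_row_bytes and is_2d are illustrative, not part of the gem:

#include "ggml.h"

// Illustrative only: byte size of one row of n_embd elements of a
// (possibly quantized) type, using the helper added in this release.
// Callers previously tended to write n_embd * ggml_type_sizef(type),
// which returned a double and is now deprecated.
static size_t kv_row_bytes(enum ggml_type type, int64_t n_embd) {
    return ggml_row_size(type, n_embd);
}

// Illustrative only: the tensor field n_dims was removed, so code that
// read t->n_dims queries the dimensionality through the new accessor.
static bool is_2d(const struct ggml_tensor * t) {
    return ggml_n_dims(t) == 2;
}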
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,7 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES 8192
+#define LLAMA_MAX_EXPERTS 8
 
 //
 // logging
@@ -231,6 +232,8 @@ enum llm_kv {
 LLM_KV_FEED_FORWARD_LENGTH,
 LLM_KV_USE_PARALLEL_RESIDUAL,
 LLM_KV_TENSOR_DATA_LAYOUT,
+LLM_KV_EXPERT_COUNT,
+LLM_KV_EXPERT_USED_COUNT,
 
 LLM_KV_ATTENTION_HEAD_COUNT,
 LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -281,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
 { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
 { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
 { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 
 { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
 { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -338,10 +343,14 @@ enum llm_tensor {
 LLM_TENSOR_ATTN_NORM,
 LLM_TENSOR_ATTN_NORM_2,
 LLM_TENSOR_ATTN_ROT_EMBD,
+LLM_TENSOR_FFN_GATE_INP,
+LLM_TENSOR_FFN_NORM,
 LLM_TENSOR_FFN_GATE,
 LLM_TENSOR_FFN_DOWN,
 LLM_TENSOR_FFN_UP,
-
+LLM_TENSOR_FFN_DOWN_EXP,
+LLM_TENSOR_FFN_GATE_EXP,
+LLM_TENSOR_FFN_UP_EXP,
 LLM_TENSOR_ATTN_Q_NORM,
 LLM_TENSOR_ATTN_K_NORM,
 };
@@ -360,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
 { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
 { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
 { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
 { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
 },
 },
 {
@@ -585,6 +598,10 @@ struct LLM_TN {
 std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
 return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
 }
+
+std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+}
 };
 
 //
@@ -1159,6 +1176,8 @@ struct llama_hparams {
 uint32_t n_layer;
 uint32_t n_rot;
 uint32_t n_ff;
+uint32_t n_expert = 0;
+uint32_t n_expert_used = 0;
 
 float f_norm_eps;
 float f_norm_rms_eps;
@@ -1173,15 +1192,18 @@ struct llama_hparams {
 float f_max_alibi_bias;
 
 bool operator!=(const llama_hparams & other) const {
-if (this->vocab_only
-if (this->n_vocab
-if (this->n_ctx_train
-if (this->n_embd
-if (this->n_head
-if (this->n_head_kv
-if (this->n_layer
-if (this->n_rot
-if (this->n_ff
+if (this->vocab_only != other.vocab_only) return true;
+if (this->n_vocab != other.n_vocab) return true;
+if (this->n_ctx_train != other.n_ctx_train) return true;
+if (this->n_embd != other.n_embd) return true;
+if (this->n_head != other.n_head) return true;
+if (this->n_head_kv != other.n_head_kv) return true;
+if (this->n_layer != other.n_layer) return true;
+if (this->n_rot != other.n_rot) return true;
+if (this->n_ff != other.n_ff) return true;
+if (this->n_expert != other.n_expert) return true;
+if (this->n_expert_used != other.n_expert_used) return true;
+
 if (this->rope_finetuned != other.rope_finetuned) return true;
 if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1263,6 +1285,12 @@ struct llama_layer {
 struct ggml_tensor * ffn_down; // w2
 struct ggml_tensor * ffn_up; // w3
 
+// ff MoE
+struct ggml_tensor * ffn_gate_inp;
+struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
 // ff bias
 struct ggml_tensor * ffn_down_b; // b2
 struct ggml_tensor * ffn_up_b; // b3
@@ -1522,7 +1550,7 @@ static bool llama_kv_cache_init(
 cache.cells.clear();
 cache.cells.resize(n_ctx);
 
-cache.buf.resize(
+cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
 memset(cache.buf.data, 0, cache.buf.size);
 
 struct ggml_init_params params;
@@ -2435,6 +2463,16 @@ static void llm_load_hparams(
 ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
 ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
 ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+if (hparams.n_expert > 0) {
+GGML_ASSERT(hparams.n_expert_used > 0);
+} else {
+GGML_ASSERT(hparams.n_expert_used == 0);
+}
 
 // n_head_kv is optional, default to n_head
 hparams.n_head_kv = hparams.n_head;
@@ -2753,7 +2791,7 @@ static void llm_load_vocab(
 // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
 // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
 // are special tokens.
-// From testing, this appears to
+// From testing, this appears to correlate 1:1 with special tokens.
 //
 
 // Counting special tokens and verifying in only one direction
@@ -2866,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
 LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
 LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
 LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
 LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
 LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3020,9 +3060,26 @@ static void llm_load_tensors(
 
 layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
-layer.
-
-layer.
+layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
+
+if (layer.ffn_gate_inp == nullptr) {
+GGML_ASSERT(hparams.n_expert == 0);
+GGML_ASSERT(hparams.n_expert_used == 0);
+
+layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+} else {
+GGML_ASSERT(hparams.n_expert > 0);
+GGML_ASSERT(hparams.n_expert_used > 0);
+
+// MoE branch
+for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
+layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+}
+}
 
 if (backend == GGML_BACKEND_GPU) {
 vram_weights +=
@@ -3032,8 +3089,18 @@ static void llm_load_tensors(
 (layer.bk ? ggml_nbytes(layer.bk) : 0) +
 (layer.bv ? ggml_nbytes(layer.bv) : 0) +
 (layer.bo ? ggml_nbytes(layer.bo) : 0) +
-ggml_nbytes(layer.ffn_norm)
-
+ggml_nbytes(layer.ffn_norm);
+
+if (layer.ffn_gate_inp == nullptr) {
+vram_weights +=
+ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+} else {
+vram_weights += ggml_nbytes(layer.ffn_gate_inp);
+for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+vram_weights +=
+ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
+}
+}
 }
 }
 } break;
@@ -3750,8 +3817,8 @@ static void llm_build_k_shift(
 ggml_rope_custom_inplace(ctx,
 ggml_view_3d(ctx, kv.k_l[il],
 n_embd_head, n_head_kv, n_ctx,
-
-
+ggml_row_size(kv.k_l[il]->type, n_embd_head),
+ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
 0),
 K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
@@ -3780,7 +3847,7 @@ static void llm_build_kv_store(
 cb(v_cur_t, "v_cur_t", il);
 
 struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
-(
+(ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
 cb(k_cache_view, "k_cache_view", il);
 
 struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -3939,8 +4006,8 @@ static struct ggml_tensor * llm_build_kqv(
 struct ggml_tensor * k =
 ggml_view_3d(ctx, kv.k_l[il],
 n_embd_head, n_kv, n_head_kv,
-
-
+ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+ggml_row_size(kv.k_l[il]->type, n_embd_head),
 0);
 cb(k, "k", il);
 
@@ -4014,6 +4081,8 @@ struct llm_build_context {
 const int64_t n_head_kv;
 const int64_t n_embd_head;
 const int64_t n_embd_gqa;
+const int64_t n_expert;
+const int64_t n_expert_used;
 
 const float freq_base;
 const float freq_scale;
@@ -4055,6 +4124,8 @@ struct llm_build_context {
 n_head_kv (hparams.n_head_kv),
 n_embd_head (hparams.n_embd_head()),
 n_embd_gqa (hparams.n_embd_gqa()),
+n_expert (hparams.n_expert),
+n_expert_used (hparams.n_expert_used),
 freq_base (cparams.rope_freq_base),
 freq_scale (cparams.rope_freq_scale),
 ext_factor (cparams.yarn_ext_factor),
@@ -4179,7 +4250,7 @@ struct llm_build_context {
 cb(ffn_inp, "ffn_inp", il);
 
 // feed-forward network
-{
+if (model.layers[il].ffn_gate_inp == nullptr) {
 cur = llm_build_norm(ctx0, ffn_inp, hparams,
 model.layers[il].ffn_norm, NULL,
 LLM_NORM_RMS, cb, il);
@@ -4191,6 +4262,69 @@ struct llm_build_context {
 model.layers[il].ffn_down, NULL,
 LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
 cb(cur, "ffn_out", il);
+} else {
+// MoE branch
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
+ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+cb(logits, "ffn_moe_logits", il);
+
+ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+cb(probs, "ffn_moe_probs", il);
+
+// select experts
+ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+ggml_tensor * weights = ggml_get_rows(ctx0,
+ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+cb(weights, "ffn_moe_weights", il);
+
+weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+cb(weights_sum, "ffn_moe_weights_sum", il);
+
+weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+cb(weights, "ffn_moe_weights_norm", il);
+
+// compute expert outputs
+ggml_tensor * moe_out = nullptr;
+
+for (int i = 0; i < n_expert_used; ++i) {
+ggml_tensor * cur_expert;
+
+ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+cb(cur_up, "ffn_moe_up", il);
+
+ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+cb(cur_gate, "ffn_moe_gate", il);
+
+cur_gate = ggml_silu(ctx0, cur_gate);
+cb(cur_gate, "ffn_moe_silu", il);
+
+cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
+cb(cur_expert, "ffn_moe_gate_par", il);
+
+cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+cb(cur_expert, "ffn_moe_down", il);
+
+cur_expert = ggml_mul(ctx0, cur_expert,
+ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+cb(cur_expert, "ffn_moe_weighted", il);
+
+if (i == 0) {
+moe_out = cur_expert;
+} else {
+moe_out = ggml_add(ctx0, moe_out, cur_expert);
+cb(moe_out, "ffn_moe_out", il);
+}
+}
+
+cur = moe_out;
 }
 
 cur = ggml_add(ctx0, cur, ffn_inp);
@@ -5445,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
 { "ffn_relu", OFFLOAD_FUNC },
 { "ffn_sqr(relu)", OFFLOAD_FUNC },
 
+{ "ffn_moe_logits", OFFLOAD_FUNC },
+{ "ffn_moe_probs", OFFLOAD_FUNC },
+{ "ffn_moe_argsort", OFFLOAD_FUNC },
+{ "ffn_moe_weights", OFFLOAD_FUNC },
+{ "ffn_moe_weights_sum", OFFLOAD_FUNC },
+{ "ffn_moe_weights_norm", OFFLOAD_FUNC },
+{ "ffn_moe_weighted", OFFLOAD_FUNC },
+{ "ffn_moe_up", OFFLOAD_FUNC },
+{ "ffn_moe_gate", OFFLOAD_FUNC },
+{ "ffn_moe_silu", OFFLOAD_FUNC },
+{ "ffn_moe_gate_par", OFFLOAD_FUNC },
+{ "ffn_moe_down", OFFLOAD_FUNC },
+{ "ffn_moe_out", OFFLOAD_FUNC },
+
 { "l_out", OFFLOAD_FUNC },
 
 { "result_norm", OFFLOAD_FUNC_EMB },
@@ -5841,7 +5989,7 @@ static int llama_decode_internal(
 const int64_t n_embd = hparams.n_embd;
 const int64_t n_vocab = hparams.n_vocab;
 
-// helpers for smoother batch API
+// helpers for smoother batch API transition
 // after deprecating the llama_eval calls, these will be removed
 std::vector<llama_pos> pos;
 
@@ -6620,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
 // loop over the text
 while (true) {
-// find the first
+// find the first occurrence of a given special token in this fragment
 // passing offset argument only limit the "search area" but match coordinates
 // are still relative to the source full raw_text
 auto match = raw_text->find(special_token, raw_text_base_offset);
 
-// no
+// no occurrences found, stop processing this fragment for a given special token
 if (match == std::string::npos) break;
 
 // check if match is within bounds of offset <-> length
@@ -7498,7 +7646,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
 for (size_t i = 0; i < candidates->size; ++i) {
 const llama_token id = candidates->data[i].id;
-const std::string
+const std::string piece = llama_token_to_piece(ctx, id);
 if (id == eos) {
 if (!allow_eos) {
 candidates->data[i].logit = -INFINITY;
@@ -7710,7 +7858,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 GGML_ASSERT(false);
 }
 
-const std::string
+const std::string piece = llama_token_to_piece(ctx, token);
 
 // Note terminating 0 in decoded string
 const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7824,7 +7972,7 @@ struct llama_beam_search_data {
 }
 
 // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-// The
+// The repetitive patterns below reflect the 2 stages of heaps:
 // * Gather elements until the vector is full, then call std::make_heap() on it.
 // * If the heap is full and a new element is found that should be included, pop the
 // least element to the back(), replace it with the new, then push it into the heap.
@@ -8062,11 +8210,9 @@ static void llama_convert_tensor_internal(
 workers.clear();
 }
 
-static ggml_type get_k_quant_type(
-quantize_state_internal & qs,
-ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
-) {
+static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
 const std::string name = ggml_get_name(tensor);
+
 // TODO: avoid hardcoded tensor names - use the TN_* constants
 const llm_arch arch = qs.model.arch;
 const auto tn = LLM_TN(arch);
@@ -8100,7 +8246,18 @@ static ggml_type get_k_quant_type(
 // nearly negligible increase in model size by quantizing this tensor with more bits:
 if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
 }
+if (qs.model.hparams.n_expert == 8) {
+// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+// TODO: explore better strategies
+new_type = GGML_TYPE_Q8_0;
+}
 ++qs.i_attention_wv;
+} else if (name.find("attn_k.weight") != std::string::npos) {
+if (qs.model.hparams.n_expert == 8) {
+// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+// TODO: explore better strategies
+new_type = GGML_TYPE_Q8_0;
+}
 } else if (name.find("ffn_down.weight") != std::string::npos) {
 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8309,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
 // quantize only 2D tensors
-quantize &= (tensor
+quantize &= (ggml_n_dims(tensor) == 2);
 quantize &= params->quantize_output_tensor || name != "output.weight";
 quantize &= !params->only_copy;
 
+// do not quantize expert gating tensors
+quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
 enum ggml_type new_type;
 void * new_data;
 size_t new_size;
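Not part of the gem: a standalone sketch of the routing arithmetic that the MoE branch above expresses as a ggml graph, useful for following what the added hunks compute per token. Plain C arrays stand in for ggml tensors; the function name route_experts and the MAX_EXPERTS macro are illustrative only, and n_used <= n_expert is assumed (the loader asserts the equivalent above).

#include <math.h>

#define MAX_EXPERTS 8 // mirrors LLAMA_MAX_EXPERTS in the diff

// softmax over the gate logits, keep the n_used most probable experts,
// then renormalize their probabilities so the selected weights sum to 1
static void route_experts(const float * logits, int n_expert, int n_used,
                          int * sel, float * weights) {
    float probs[MAX_EXPERTS];

    // softmax over all expert logits (subtract the max for numerical stability)
    float max = logits[0];
    for (int e = 1; e < n_expert; ++e) if (logits[e] > max) max = logits[e];
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) { probs[e] = expf(logits[e] - max); sum += probs[e]; }
    for (int e = 0; e < n_expert; ++e) probs[e] /= sum;

    // pick the n_used largest probabilities (what ggml_top_k does via argsort)
    for (int i = 0; i < n_used; ++i) {
        int best = -1;
        for (int e = 0; e < n_expert; ++e) {
            int taken = 0;
            for (int j = 0; j < i; ++j) if (sel[j] == e) taken = 1;
            if (!taken && (best < 0 || probs[e] > probs[best])) best = e;
        }
        sel[i]     = best;
        weights[i] = probs[best];
    }

    // renormalize the selected weights (the ggml_div by ggml_sum_rows step)
    float wsum = 0.0f;
    for (int i = 0; i < n_used; ++i) wsum += weights[i];
    for (int i = 0; i < n_used; ++i) weights[i] /= wsum;
}

The per-token output is then the weighted sum, over the selected experts, of each expert's SiLU-gated feed-forward result, which the graph builds with ggml_mul_mat_id, ggml_silu, ggml_mul, and ggml_add in the hunk above.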
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -216,7 +216,7 @@ extern "C" {
 
 // Keep the booleans together to avoid misalignment during copy-by-value.
 bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-bool logits_all; // the llama_eval() call computes all logits, not just the last one
+bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 bool embedding; // embedding mode only
 bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 };
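For context on the deprecation note above, a hedged usage sketch of the replacement it points to: per-token logit flags set on the batch rather than a context-wide logits_all. The helper name fill_batch and the token inputs are made up here; the llama_batch field names follow llama.h as bundled at this revision, so verify against data/ext/llama_cpp/src/llama.h.

#include "llama.h"

// Mark only the last position for logit output instead of using logits_all.
// batch is assumed to have been allocated with capacity for n_tokens.
void fill_batch(struct llama_batch * batch, const llama_token * tokens, int n_tokens) {
    for (int i = 0; i < n_tokens; ++i) {
        batch->token[i]     = tokens[i];
        batch->pos[i]       = i;
        batch->n_seq_id[i]  = 1;
        batch->seq_id[i][0] = 0;
        batch->logits[i]    = (i == n_tokens - 1); // only the final token needs logits
    }
    batch->n_tokens = n_tokens;
}

A batch with room for n_tokens can, in this llama.cpp revision, be obtained with llama_batch_init(n_tokens, 0, 1) and released with llama_batch_free().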
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.10.0'
+VERSION = '0.10.1'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = '
+LLAMA_CPP_VERSION = 'b1641'
 end
data/sig/llama_cpp.rbs
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-version: 0.10.0
+version: 0.10.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-12-
+date: 2023-12-16 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: