llama_cpp 0.10.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +691 -93
- data/ext/llama_cpp/src/ggml-metal.m +535 -54
- data/ext/llama_cpp/src/ggml-metal.metal +1497 -169
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +325 -159
- data/ext/llama_cpp/src/ggml.h +34 -13
- data/ext/llama_cpp/src/llama.cpp +195 -35
- data/ext/llama_cpp/src/llama.h +1 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -215,9 +215,9 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         1024
+#define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            6
+#define GGML_MAX_SRC            10
 #define GGML_MAX_NAME           64
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@@ -423,7 +423,9 @@ extern "C" {
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_UPSCALE, // nearest interpolate
+       GGML_OP_PAD,
        GGML_OP_ARGSORT,
+       GGML_OP_LEAKY_RELU,
 
        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@@ -463,7 +465,6 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
-       GGML_UNARY_OP_LEAKY,
 
        GGML_UNARY_OP_COUNT,
    };
@@ -501,7 +502,6 @@ extern "C" {
 
        struct ggml_backend_buffer * buffer;
 
-       int n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@@ -533,7 +533,7 @@ extern "C" {
 
        void * extra; // extra things e.g. for ggml-cuda.cu
 
-       char padding[
+       char padding[8];
    };
 
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -638,11 +638,14 @@ extern "C" {
    GGML_API int64_t ggml_nrows      (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes     (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-   GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
-   GGML_API int
-   GGML_API size_t
-   GGML_API
+   GGML_API int    ggml_blck_size(enum ggml_type type);
+   GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+   GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+   GGML_DEPRECATED(
+   GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+   "use ggml_row_size() instead");
 
    GGML_API const char * ggml_type_name(enum ggml_type type);
    GGML_API const char * ggml_op_name  (enum ggml_op   op);
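Note on the hunk above: ggml_row_size() returns the exact byte count of one row of a (possibly quantized) type, and ggml_type_sizef() is now deprecated in its favour. A minimal usage sketch, not part of the released diff; it assumes a program built against the bundled ggml sources, and the row length of 4096 is an arbitrary example (it must be a multiple of the type's block size):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        const int64_t ne0 = 4096; // elements per row (example value)

        // new API: exact integer size in bytes of one Q4_0 row
        const size_t row_bytes = ggml_row_size(GGML_TYPE_Q4_0, ne0);

        // old API (now GGML_DEPRECATED): float bytes-per-element, then multiply and truncate
        // const size_t row_bytes_old = (size_t)(ggml_type_sizef(GGML_TYPE_Q4_0) * ne0);

        std::printf("one Q4_0 row of %lld elements: %zu bytes\n", (long long) ne0, row_bytes);
        return 0;
    }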
@@ -661,6 +664,11 @@ extern "C" {
    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+   GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+   GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+   GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+   GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+   GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
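With n_dims removed from struct ggml_tensor (see the struct hunk earlier in this file), callers are expected to go through the new shape helpers instead. A small migration sketch, not part of the diff; the context size and tensor shape are arbitrary example values:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);

        // before: t->n_dims == 2; after: query the shape through the API
        std::printf("n_dims  = %d\n", ggml_n_dims(t));    // 2
        std::printf("matrix? = %d\n", ggml_is_matrix(t)); // 1 (true)

        ggml_free(ctx);
        return 0;
    }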
@@ -793,6 +801,9 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
 
+   // dst = a
+   // view(dst, nb1, nb2, nb3, offset) += b
+   // return dst
    GGML_API struct ggml_tensor * ggml_acc(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -957,15 +968,14 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
 
-   GGML_API struct ggml_tensor * ggml_leaky(
+   GGML_API struct ggml_tensor * ggml_leaky_relu(
            struct ggml_context * ctx,
-           struct ggml_tensor  * a);
+           struct ggml_tensor  * a, float negative_slope, bool inplace);
 
    GGML_API struct ggml_tensor * ggml_relu_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
 
-   // TODO: double-check this computation is correct
    GGML_API struct ggml_tensor * ggml_gelu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
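The leaky activation moves from a fixed-slope unary op to ggml_leaky_relu() with an explicit negative_slope and an inplace flag. A hedged end-to-end sketch, not part of the diff; the slope 0.1f and the input value are example choices:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(a, -1.0f); // fill the input with -1

        // new signature: explicit negative slope, optional in-place variant
        struct ggml_tensor * out = ggml_leaky_relu(ctx, a, 0.1f, /*inplace =*/ false);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        std::printf("leaky_relu(-1.0, slope 0.1) = %f\n", ggml_get_f32_1d(out, 0)); // ~ -0.1
        ggml_free(ctx);
        return 0;
    }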
@@ -1051,7 +1061,8 @@ extern "C" {
    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
-           struct ggml_tensor  * as[],
+           struct ggml_tensor  * const as[],
+           int                   n_as,
            struct ggml_tensor  * ids,
            int                   id,
            struct ggml_tensor  * b);
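ggml_mul_mat_id() now takes the expert matrices as a const array plus an explicit n_as count. A compile-only sketch of the new call shape, not part of the diff; the wrapper name and parameters are illustrative:

    #include "ggml.h"

    // Multiply b by the expert matrix selected by ids[i_expert_slot], using the
    // new (as, n_as, ids, id, b) argument order.
    struct ggml_tensor * pick_expert_matmul(
            struct ggml_context * ctx,
            struct ggml_tensor  * const experts[], int n_expert,
            struct ggml_tensor  * ids,            // I32 tensor of selected expert indices
            int                   i_expert_slot,
            struct ggml_tensor  * b) {
        // roughly equivalent to ggml_mul_mat(experts[ids[i_expert_slot]], b)
        return ggml_mul_mat_id(ctx, experts, n_expert, ids, i_expert_slot, b);
    }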
@@ -1263,6 +1274,7 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
 
+   // supports 3D: a->ne[2] == b->ne[1]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1549,6 +1561,15 @@ extern "C" {
            struct ggml_tensor  * a,
            int                   scale_factor);
 
+   // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+   GGML_API struct ggml_tensor * ggml_pad(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * a,
+           int                   p0,
+           int                   p1,
+           int                   p2,
+           int                   p3);
+
    // sort rows
    enum ggml_sort_order {
        GGML_SORT_ASC,
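ggml_pad() appends p0..p3 zero elements to the corresponding dimensions of a tensor. A small sketch, not part of the diff; the 5x3 -> 8x4 shapes are arbitrary example values:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        // 5x3 matrix, zero-padded to 8x4 (p0/p1 pad dims 0/1; p2/p3 left at 0)
        struct ggml_tensor * a      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 5, 3);
        struct ggml_tensor * padded = ggml_pad(ctx, a, 3, 1, 0, 0);

        std::printf("padded shape: %lld x %lld\n",
                (long long) padded->ne[0], (long long) padded->ne[1]); // 8 x 4

        ggml_free(ctx);
        return 0;
    }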
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,7 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES   8192
+#define LLAMA_MAX_EXPERTS 8
 
 //
 // logging
@@ -231,6 +232,8 @@ enum llm_kv {
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
+   LLM_KV_EXPERT_COUNT,
+   LLM_KV_EXPERT_USED_COUNT,
 
    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -281,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_FEED_FORWARD_LENGTH,   "%s.feed_forward_length" },
    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
    { LLM_KV_TENSOR_DATA_LAYOUT,    "%s.tensor_data_layout" },
+   { LLM_KV_EXPERT_COUNT,          "%s.expert_count" },
+   { LLM_KV_EXPERT_USED_COUNT,     "%s.expert_used_count" },
 
    { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count" },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -338,10 +343,14 @@ enum llm_tensor {
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_ROT_EMBD,
+   LLM_TENSOR_FFN_GATE_INP,
+   LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
-
+   LLM_TENSOR_FFN_DOWN_EXP,
+   LLM_TENSOR_FFN_GATE_EXP,
+   LLM_TENSOR_FFN_UP_EXP,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
 };
@@ -360,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+           { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+           { LLM_TENSOR_FFN_GATE_EXP,  "blk.%d.ffn_gate.%d" },
+           { LLM_TENSOR_FFN_DOWN_EXP,  "blk.%d.ffn_down.%d" },
+           { LLM_TENSOR_FFN_UP_EXP,    "blk.%d.ffn_up.%d" },
        },
    },
    {
@@ -585,6 +598,10 @@ struct LLM_TN {
    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
    }
+
+   std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+       return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+   }
 };
 
 //
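For orientation, the two-index LLM_TN overload above expands per-expert tensor names of the form "blk.<bid>.ffn_gate.<xid>.weight". A standalone illustration, not part of the diff; it mimics the printf-style expansion with snprintf rather than llama.cpp's internal ::format helper:

    #include <cstdio>
    #include <string>

    // Hypothetical example: expand "blk.%d.ffn_gate.%d" for block 2, expert 5,
    // then append the ".weight" suffix, mirroring LLM_TN::operator()(tensor, suffix, bid, xid).
    int main() {
        char buf[64];
        std::snprintf(buf, sizeof(buf), "blk.%d.ffn_gate.%d", 2, 5);
        std::string name = std::string(buf) + "." + "weight";
        std::printf("%s\n", name.c_str()); // prints: blk.2.ffn_gate.5.weight
        return 0;
    }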
@@ -1159,6 +1176,8 @@ struct llama_hparams {
    uint32_t n_layer;
    uint32_t n_rot;
    uint32_t n_ff;
+   uint32_t n_expert      = 0;
+   uint32_t n_expert_used = 0;
 
    float f_norm_eps;
    float f_norm_rms_eps;
@@ -1173,15 +1192,18 @@ struct llama_hparams {
    float f_max_alibi_bias;
 
    bool operator!=(const llama_hparams & other) const {
-       if (this->vocab_only
-       if (this->n_vocab
-       if (this->n_ctx_train
-       if (this->n_embd
-       if (this->n_head
-       if (this->n_head_kv
-       if (this->n_layer
-       if (this->n_rot
-       if (this->n_ff
+       if (this->vocab_only    != other.vocab_only)    return true;
+       if (this->n_vocab       != other.n_vocab)       return true;
+       if (this->n_ctx_train   != other.n_ctx_train)   return true;
+       if (this->n_embd        != other.n_embd)        return true;
+       if (this->n_head        != other.n_head)        return true;
+       if (this->n_head_kv     != other.n_head_kv)     return true;
+       if (this->n_layer       != other.n_layer)       return true;
+       if (this->n_rot         != other.n_rot)         return true;
+       if (this->n_ff          != other.n_ff)          return true;
+       if (this->n_expert      != other.n_expert)      return true;
+       if (this->n_expert_used != other.n_expert_used) return true;
+
        if (this->rope_finetuned  != other.rope_finetuned)  return true;
        if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1263,6 +1285,12 @@ struct llama_layer {
    struct ggml_tensor * ffn_down; // w2
    struct ggml_tensor * ffn_up;   // w3
 
+   // ff MoE
+   struct ggml_tensor * ffn_gate_inp;
+   struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+   struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+   struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
    // ff bias
    struct ggml_tensor * ffn_down_b; // b2
    struct ggml_tensor * ffn_up_b;   // b3
@@ -1522,7 +1550,7 @@ static bool llama_kv_cache_init(
    cache.cells.clear();
    cache.cells.resize(n_ctx);
 
-   cache.buf.resize(
+   cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
    memset(cache.buf.data, 0, cache.buf.size);
 
    struct ggml_init_params params;
@@ -2435,6 +2463,16 @@ static void llm_load_hparams(
    ml.get_key (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
    ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
    ml.get_key (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
+   ml.get_key (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
+   ml.get_key (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
+
+   GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+   GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+   if (hparams.n_expert > 0) {
+       GGML_ASSERT(hparams.n_expert_used > 0);
+   } else {
+       GGML_ASSERT(hparams.n_expert_used == 0);
+   }
 
    // n_head_kv is optional, default to n_head
    hparams.n_head_kv = hparams.n_head;
@@ -2753,7 +2791,7 @@ static void llm_load_vocab(
    // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
    // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
    // are special tokens.
-   // From testing, this appears to
+   // From testing, this appears to correlate 1:1 with special tokens.
    //
 
    // Counting special tokens and verifying in only one direction
@@ -2866,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n", __func__, hparams.f_clamp_kqv);
    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
    LLAMA_LOG_INFO("%s: n_ff             = %u\n", __func__, hparams.n_ff);
+   LLAMA_LOG_INFO("%s: n_expert         = %u\n", __func__, hparams.n_expert);
+   LLAMA_LOG_INFO("%s: n_expert_used    = %u\n", __func__, hparams.n_expert_used);
    LLAMA_LOG_INFO("%s: rope scaling     = %s\n", __func__, rope_scaling_type.c_str());
    LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3020,9 +3060,26 @@ static void llm_load_tensors(
 
            layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
-           layer.
-
-           layer.
+           layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
+
+           if (layer.ffn_gate_inp == nullptr) {
+               GGML_ASSERT(hparams.n_expert == 0);
+               GGML_ASSERT(hparams.n_expert_used == 0);
+
+               layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+               layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+               layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
+           } else {
+               GGML_ASSERT(hparams.n_expert > 0);
+               GGML_ASSERT(hparams.n_expert_used > 0);
+
+               // MoE branch
+               for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+                   layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+                   layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
+                   layer.ffn_up_exp[x]   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd, n_ff}, backend_split);
+               }
+           }
 
            if (backend == GGML_BACKEND_GPU) {
                vram_weights +=
@@ -3032,8 +3089,18 @@ static void llm_load_tensors(
                    (layer.bk ? ggml_nbytes(layer.bk) : 0) +
                    (layer.bv ? ggml_nbytes(layer.bv) : 0) +
                    (layer.bo ? ggml_nbytes(layer.bo) : 0) +
-                   ggml_nbytes(layer.ffn_norm)
-
+                   ggml_nbytes(layer.ffn_norm);
+
+               if (layer.ffn_gate_inp == nullptr) {
+                   vram_weights +=
+                       ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+               } else {
+                   vram_weights += ggml_nbytes(layer.ffn_gate_inp);
+                   for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+                       vram_weights +=
+                           ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
+                   }
+               }
            }
        }
    } break;
@@ -3750,8 +3817,8 @@ static void llm_build_k_shift(
        ggml_rope_custom_inplace(ctx,
                ggml_view_3d(ctx, kv.k_l[il],
                    n_embd_head, n_head_kv, n_ctx,
-
-
+                   ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                   ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
                    0),
                K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);
@@ -3780,7 +3847,7 @@ static void llm_build_kv_store(
    cb(v_cur_t, "v_cur_t", il);
 
    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
-           (
+           (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
    cb(k_cache_view, "k_cache_view", il);
 
    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -3939,8 +4006,8 @@ static struct ggml_tensor * llm_build_kqv(
    struct ggml_tensor * k =
        ggml_view_3d(ctx, kv.k_l[il],
                n_embd_head, n_kv, n_head_kv,
-
-
+               ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+               ggml_row_size(kv.k_l[il]->type, n_embd_head),
                0);
    cb(k, "k", il);
 
@@ -4014,6 +4081,8 @@ struct llm_build_context {
    const int64_t n_head_kv;
    const int64_t n_embd_head;
    const int64_t n_embd_gqa;
+   const int64_t n_expert;
+   const int64_t n_expert_used;
 
    const float freq_base;
    const float freq_scale;
@@ -4055,6 +4124,8 @@ struct llm_build_context {
        n_head_kv     (hparams.n_head_kv),
        n_embd_head   (hparams.n_embd_head()),
        n_embd_gqa    (hparams.n_embd_gqa()),
+       n_expert      (hparams.n_expert),
+       n_expert_used (hparams.n_expert_used),
        freq_base     (cparams.rope_freq_base),
        freq_scale    (cparams.rope_freq_scale),
        ext_factor    (cparams.yarn_ext_factor),
@@ -4179,7 +4250,7 @@ struct llm_build_context {
        cb(ffn_inp, "ffn_inp", il);
 
        // feed-forward network
-       {
+       if (model.layers[il].ffn_gate_inp == nullptr) {
            cur = llm_build_norm(ctx0, ffn_inp, hparams,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, cb, il);
@@ -4191,6 +4262,69 @@ struct llm_build_context {
                    model.layers[il].ffn_down, NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
            cb(cur, "ffn_out", il);
+       } else {
+           // MoE branch
+           cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                   model.layers[il].ffn_norm, NULL,
+                   LLM_NORM_RMS, cb, il);
+           cb(cur, "ffn_norm", il);
+
+           ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+           cb(logits, "ffn_moe_logits", il);
+
+           ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+           cb(probs, "ffn_moe_probs", il);
+
+           // select experts
+           ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+           cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+           ggml_tensor * weights = ggml_get_rows(ctx0,
+                   ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+           cb(weights, "ffn_moe_weights", il);
+
+           weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+           ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+           cb(weights_sum, "ffn_moe_weights_sum", il);
+
+           weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+           cb(weights, "ffn_moe_weights_norm", il);
+
+           // compute expert outputs
+           ggml_tensor * moe_out = nullptr;
+
+           for (int i = 0; i < n_expert_used; ++i) {
+               ggml_tensor * cur_expert;
+
+               ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+               cb(cur_up, "ffn_moe_up", il);
+
+               ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+               cb(cur_gate, "ffn_moe_gate", il);
+
+               cur_gate = ggml_silu(ctx0, cur_gate);
+               cb(cur_gate, "ffn_moe_silu", il);
+
+               cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
+               cb(cur_expert, "ffn_moe_gate_par", il);
+
+               cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+               cb(cur_expert, "ffn_moe_down", il);
+
+               cur_expert = ggml_mul(ctx0, cur_expert,
+                       ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+               cb(cur_expert, "ffn_moe_weighted", il);
+
+               if (i == 0) {
+                   moe_out = cur_expert;
+               } else {
+                   moe_out = ggml_add(ctx0, moe_out, cur_expert);
+                   cb(moe_out, "ffn_moe_out", il);
+               }
+           }
+
+           cur = moe_out;
        }
 
        cur = ggml_add(ctx0, cur, ffn_inp);
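The MoE block above expresses expert routing entirely as graph ops (soft_max, top_k, get_rows, sum_rows, div). As a plain-C++ illustration of that per-token arithmetic only, not part of the diff and with example gate logits, n_expert = 8 and n_expert_used = 2 assumed:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // For one token: softmax the gate logits, keep the top n_used experts,
    // and renormalize their probabilities so they sum to 1.
    int main() {
        const int n_expert = 8, n_used = 2; // example values
        std::vector<float> logits = { 0.1f, 2.0f, -1.0f, 0.5f, 1.5f, -0.3f, 0.0f, 0.7f };

        // softmax over the n_expert gate logits
        std::vector<float> probs(n_expert);
        const float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (float & p : probs) { p /= sum; }

        // top-k expert indices (argsort by probability, keep the first n_used)
        std::vector<int> idx(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected weights; each expert's FFN output is scaled by these
        float wsum = 0.0f;
        for (int k = 0; k < n_used; ++k) { wsum += probs[idx[k]]; }
        for (int k = 0; k < n_used; ++k) {
            std::printf("expert %d weight %.3f\n", idx[k], probs[idx[k]] / wsum);
        }
        return 0;
    }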
@@ -5445,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
    { "ffn_relu",             OFFLOAD_FUNC },
    { "ffn_sqr(relu)",        OFFLOAD_FUNC },
 
+   { "ffn_moe_logits",       OFFLOAD_FUNC },
+   { "ffn_moe_probs",        OFFLOAD_FUNC },
+   { "ffn_moe_argsort",      OFFLOAD_FUNC },
+   { "ffn_moe_weights",      OFFLOAD_FUNC },
+   { "ffn_moe_weights_sum",  OFFLOAD_FUNC },
+   { "ffn_moe_weights_norm", OFFLOAD_FUNC },
+   { "ffn_moe_weighted",     OFFLOAD_FUNC },
+   { "ffn_moe_up",           OFFLOAD_FUNC },
+   { "ffn_moe_gate",         OFFLOAD_FUNC },
+   { "ffn_moe_silu",         OFFLOAD_FUNC },
+   { "ffn_moe_gate_par",     OFFLOAD_FUNC },
+   { "ffn_moe_down",         OFFLOAD_FUNC },
+   { "ffn_moe_out",          OFFLOAD_FUNC },
+
    { "l_out",                OFFLOAD_FUNC },
 
    { "result_norm",          OFFLOAD_FUNC_EMB },
@@ -5841,7 +5989,7 @@ static int llama_decode_internal(
    const int64_t n_embd  = hparams.n_embd;
    const int64_t n_vocab = hparams.n_vocab;
 
-   // helpers for smoother batch API
+   // helpers for smoother batch API transition
    // after deprecating the llama_eval calls, these will be removed
    std::vector<llama_pos> pos;
 
@@ -6620,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
        // loop over the text
        while (true) {
-           // find the first
+           // find the first occurrence of a given special token in this fragment
            // passing offset argument only limit the "search area" but match coordinates
            // are still relative to the source full raw_text
            auto match = raw_text->find(special_token, raw_text_base_offset);
 
-           // no
+           // no occurrences found, stop processing this fragment for a given special token
            if (match == std::string::npos) break;
 
            // check if match is within bounds of offset <-> length
@@ -7498,7 +7646,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
    for (size_t i = 0; i < candidates->size; ++i) {
        const llama_token id = candidates->data[i].id;
-       const std::string
+       const std::string piece = llama_token_to_piece(ctx, id);
        if (id == eos) {
            if (!allow_eos) {
                candidates->data[i].logit = -INFINITY;
@@ -7710,7 +7858,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
        GGML_ASSERT(false);
    }
 
-   const std::string
+   const std::string piece = llama_token_to_piece(ctx, token);
 
    // Note terminating 0 in decoded string
    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7824,7 +7972,7 @@ struct llama_beam_search_data {
    }
 
    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-   // The
+   // The repetitive patterns below reflect the 2 stages of heaps:
    //  * Gather elements until the vector is full, then call std::make_heap() on it.
    //  * If the heap is full and a new element is found that should be included, pop the
    //    least element to the back(), replace it with the new, then push it into the heap.
@@ -8062,11 +8210,9 @@ static void llama_convert_tensor_internal(
    workers.clear();
 }
 
-static ggml_type get_k_quant_type(
-   quantize_state_internal & qs,
-   ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
-) {
+static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    const std::string name = ggml_get_name(tensor);
+
    // TODO: avoid hardcoded tensor names - use the TN_* constants
    const llm_arch arch = qs.model.arch;
    const auto tn = LLM_TN(arch);
@@ -8100,7 +8246,18 @@ static ggml_type get_k_quant_type(
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
+       if (qs.model.hparams.n_expert == 8) {
+           // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+           // TODO: explore better strategies
+           new_type = GGML_TYPE_Q8_0;
+       }
        ++qs.i_attention_wv;
+   } else if (name.find("attn_k.weight") != std::string::npos) {
+       if (qs.model.hparams.n_expert == 8) {
+           // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+           // TODO: explore better strategies
+           new_type = GGML_TYPE_Q8_0;
+       }
    } else if (name.find("ffn_down.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8309,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
        // quantize only 2D tensors
-       quantize &= (tensor->n_dims == 2);
+       quantize &= (ggml_n_dims(tensor) == 2);
        quantize &= params->quantize_output_tensor || name != "output.weight";
        quantize &= !params->only_copy;
 
+       // do not quantize expert gating tensors
+       quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
        enum ggml_type new_type;
        void * new_data;
        size_t new_size;
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -216,7 +216,7 @@ extern "C" {
 
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-       bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+       bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        bool embedding;   // embedding mode only
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    };
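Per the new comment, per-token logit selection now goes through llama_batch.logits rather than the context-wide logits_all flag. A hedged sketch, not part of the diff; it assumes a batch allocated with llama_batch_init(n_tokens, 0, 1) and requests logits only for the final token, which llama_get_logits_ith() can then read after llama_decode():

    #include "llama.h"

    // Fill a llama_batch so that only the last token produces logits.
    // (Sketch only: `batch` must come from llama_batch_init with n_seq_max >= 1.)
    void fill_batch_last_logits(llama_batch & batch, const llama_token * tokens, int n_tokens) {
        batch.n_tokens = n_tokens;
        for (int i = 0; i < n_tokens; ++i) {
            batch.token[i]     = tokens[i];
            batch.pos[i]       = i;
            batch.n_seq_id[i]  = 1;
            batch.seq_id[i][0] = 0;                   // single sequence 0
            batch.logits[i]    = (i == n_tokens - 1); // request logits for the final token only
        }
    }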
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.10.0'
+  VERSION = '0.10.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1641'
 end
data/sig/llama_cpp.rbs
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.10.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-12-
+date: 2023-12-16 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: