llama_cpp 0.10.0 → 0.10.1

@@ -215,9 +215,9 @@
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
  #define GGML_MAX_DIMS 4
- #define GGML_MAX_PARAMS 1024
+ #define GGML_MAX_PARAMS 2048
  #define GGML_MAX_CONTEXTS 64
- #define GGML_MAX_SRC 6
+ #define GGML_MAX_SRC 10
  #define GGML_MAX_NAME 64
  #define GGML_MAX_OP_PARAMS 64
  #define GGML_DEFAULT_N_THREADS 4
@@ -423,7 +423,9 @@ extern "C" {
  GGML_OP_POOL_1D,
  GGML_OP_POOL_2D,
  GGML_OP_UPSCALE, // nearest interpolate
+ GGML_OP_PAD,
  GGML_OP_ARGSORT,
+ GGML_OP_LEAKY_RELU,
 
  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -463,7 +465,6 @@ extern "C" {
  GGML_UNARY_OP_GELU,
  GGML_UNARY_OP_GELU_QUICK,
  GGML_UNARY_OP_SILU,
- GGML_UNARY_OP_LEAKY,
 
  GGML_UNARY_OP_COUNT,
  };
@@ -501,7 +502,6 @@ extern "C" {
 
  struct ggml_backend_buffer * buffer;
 
- int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
  // nb[0] = ggml_type_size(type)
@@ -533,7 +533,7 @@ extern "C" {
 
  void * extra; // extra things e.g. for ggml-cuda.cu
 
- char padding[12];
+ char padding[8];
  };
 
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -638,11 +638,14 @@ extern "C" {
  GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
  GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
- GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
- GGML_API int ggml_blck_size (enum ggml_type type);
- GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
- GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+ GGML_API int ggml_blck_size(enum ggml_type type);
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+ GGML_DEPRECATED(
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+ "use ggml_row_size() instead");
 
  GGML_API const char * ggml_type_name(enum ggml_type type);
  GGML_API const char * ggml_op_name (enum ggml_op op);
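Note the deprecation above: ggml_type_sizef() is replaced by ggml_row_size(), which returns an exact byte count for a whole row instead of a fractional per-element size. A minimal sketch of the migration, assuming a hypothetical kv_buf_size() helper that mirrors the llama_kv_cache_init change further down in this diff:

```c
#include "ggml.h"

// Sketch: byte size of a KV-cache buffer holding n_elements elements per type.
// (kv_buf_size is a hypothetical helper, not part of the library.)
static size_t kv_buf_size(enum ggml_type ktype, enum ggml_type vtype, int64_t n_elements) {
    // before: (size_t)(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)))
    // after:  exact sizes via ggml_row_size()
    return ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements);
}
```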
@@ -661,6 +664,11 @@ extern "C" {
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -793,6 +801,9 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ // dst = a
+ // view(dst, nb1, nb2, nb3, offset) += b
+ // return dst
  GGML_API struct ggml_tensor * ggml_acc(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -957,15 +968,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
- GGML_API struct ggml_tensor * ggml_leaky(
+ GGML_API struct ggml_tensor * ggml_leaky_relu(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a, float negative_slope, bool inplace);
 
  GGML_API struct ggml_tensor * ggml_relu_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
- // TODO: double-check this computation is correct
  GGML_API struct ggml_tensor * ggml_gelu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
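ggml_leaky() is renamed to ggml_leaky_relu() and now takes an explicit negative slope plus an inplace flag. A hedged sketch of the call-site update; the helper name and the 0.1f slope are illustrative, not from the library:

```c
#include "ggml.h"

// Hypothetical helper showing the updated call; ctx and x come from the caller.
static struct ggml_tensor * apply_leaky(struct ggml_context * ctx, struct ggml_tensor * x) {
    // before: return ggml_leaky(ctx, x);
    return ggml_leaky_relu(ctx, x, 0.1f, /*inplace =*/ false); // slope is now explicit
}
```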
@@ -1051,7 +1061,8 @@ extern "C" {
  // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
- struct ggml_tensor * as[],
+ struct ggml_tensor * const as[],
+ int n_as,
  struct ggml_tensor * ids,
  int id,
  struct ggml_tensor * b);
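ggml_mul_mat_id() now takes the candidate matrices as a const array plus an explicit n_as count. A sketch of the updated call, matching the MoE usage later in this diff; the wrapper name is illustrative:

```c
#include "ggml.h"

// Multiply b by the matrix selected from `as` (n_as candidates, e.g. one per expert)
// according to the id tensor `ids` at position `id_index`.
static struct ggml_tensor * expert_matmul(
        struct ggml_context * ctx,
        struct ggml_tensor * const as[], int n_as,
        struct ggml_tensor * ids, int id_index,
        struct ggml_tensor * b) {
    // before: ggml_mul_mat_id(ctx, as, ids, id_index, b)
    return ggml_mul_mat_id(ctx, as, n_as, ids, id_index, b);
}
```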
@@ -1263,6 +1274,7 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // supports 3D: a->ne[2] == b->ne[1]
  GGML_API struct ggml_tensor * ggml_get_rows(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1549,6 +1561,15 @@ extern "C" {
  struct ggml_tensor * a,
  int scale_factor);
 
+ // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+ GGML_API struct ggml_tensor * ggml_pad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1,
+ int p2,
+ int p3);
+
  // sort rows
  enum ggml_sort_order {
  GGML_SORT_ASC,
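ggml_pad() is new: per the comment above, it appends p0..p3 zero elements at the end of each of the four dimensions. A small sketch; the pad amounts and helper name are illustrative:

```c
#include "ggml.h"

// Grow dim 0 by 2 and dim 1 by 1, filling the new elements with zeros:
// [ne0, ne1, ne2, ne3] -> [ne0 + 2, ne1 + 1, ne2, ne3]
static struct ggml_tensor * pad_example(struct ggml_context * ctx, struct ggml_tensor * a) {
    return ggml_pad(ctx, a, 2, 1, 0, 0);
}
```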
@@ -91,7 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif
 
- #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_EXPERTS 8
 
  //
  // logging
@@ -231,6 +232,8 @@ enum llm_kv {
  LLM_KV_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
+ LLM_KV_EXPERT_COUNT,
+ LLM_KV_EXPERT_USED_COUNT,
 
  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -281,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
  { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
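The "%s" in the templates above is the architecture name, so for the llama architecture the two new GGUF keys come out as llama.expert_count and llama.expert_used_count. A hedged sketch of reading them back through the gguf API declared in ggml.h (the file name is hypothetical, error handling is minimal, and the keys are treated as optional since dense models omit them):

```c
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // hypothetical path
    if (ctx == NULL) return 1;

    const int kid = gguf_find_key(ctx, "llama.expert_count"); // -1 if absent (dense model)
    const uint32_t n_expert = kid >= 0 ? gguf_get_val_u32(ctx, kid) : 0;
    printf("n_expert = %u\n", n_expert);

    gguf_free(ctx);
    return 0;
}
```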
@@ -338,10 +343,14 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_NORM,
  LLM_TENSOR_ATTN_NORM_2,
  LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
- LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  };
@@ -360,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  },
  },
  {
@@ -585,6 +598,10 @@ struct LLM_TN {
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  }
+
+ std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+ }
  };
 
  //
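The new two-index overload above formats per-expert tensor names from the "blk.%d.ffn_*.%d" templates, e.g. LLM_TENSOR_FFN_GATE_EXP with bid = 0 and xid = 3 plus the "weight" suffix yields "blk.0.ffn_gate.3.weight". A minimal sketch of the equivalent formatting in plain C:

```c
#include <stdio.h>

int main(void) {
    char name[64];
    // template "blk.%d.ffn_gate.%d" + "." + suffix, as produced by LLM_TN(...)(tensor, "weight", bid, xid)
    snprintf(name, sizeof(name), "blk.%d.ffn_gate.%d.%s", 0, 3, "weight");
    printf("%s\n", name); // prints: blk.0.ffn_gate.3.weight
    return 0;
}
```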
@@ -1159,6 +1176,8 @@ struct llama_hparams {
  uint32_t n_layer;
  uint32_t n_rot;
  uint32_t n_ff;
+ uint32_t n_expert = 0;
+ uint32_t n_expert_used = 0;
 
  float f_norm_eps;
  float f_norm_rms_eps;
@@ -1173,15 +1192,18 @@ struct llama_hparams {
  float f_max_alibi_bias;
 
  bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
- if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_head != other.n_head) return true;
- if (this->n_head_kv != other.n_head_kv) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_ff != other.n_ff) return true;
+ if (this->vocab_only != other.vocab_only) return true;
+ if (this->n_vocab != other.n_vocab) return true;
+ if (this->n_ctx_train != other.n_ctx_train) return true;
+ if (this->n_embd != other.n_embd) return true;
+ if (this->n_head != other.n_head) return true;
+ if (this->n_head_kv != other.n_head_kv) return true;
+ if (this->n_layer != other.n_layer) return true;
+ if (this->n_rot != other.n_rot) return true;
+ if (this->n_ff != other.n_ff) return true;
+ if (this->n_expert != other.n_expert) return true;
+ if (this->n_expert_used != other.n_expert_used) return true;
+
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1263,6 +1285,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down; // w2
  struct ggml_tensor * ffn_up; // w3
 
+ // ff MoE
+ struct ggml_tensor * ffn_gate_inp;
+ struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -1522,7 +1550,7 @@ static bool llama_kv_cache_init(
  cache.cells.clear();
  cache.cells.resize(n_ctx);
 
- cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
+ cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
  memset(cache.buf.data, 0, cache.buf.size);
 
  struct ggml_init_params params;
@@ -2435,6 +2463,16 @@ static void llm_load_hparams(
  ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
  ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
  ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+ ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+ GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+ GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+ if (hparams.n_expert > 0) {
+ GGML_ASSERT(hparams.n_expert_used > 0);
+ } else {
+ GGML_ASSERT(hparams.n_expert_used == 0);
+ }
 
  // n_head_kv is optional, default to n_head
  hparams.n_head_kv = hparams.n_head;
@@ -2753,7 +2791,7 @@ static void llm_load_vocab(
  // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
  // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
  // are special tokens.
- // From testing, this appears to corelate 1:1 with special tokens.
+ // From testing, this appears to correlate 1:1 with special tokens.
  //
 
  // Counting special tokens and verifying in only one direction
@@ -2866,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3020,9 +3060,26 @@ static void llm_load_tensors(
 
  layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
+
+ if (layer.ffn_gate_inp == nullptr) {
+ GGML_ASSERT(hparams.n_expert == 0);
+ GGML_ASSERT(hparams.n_expert_used == 0);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ } else {
+ GGML_ASSERT(hparams.n_expert > 0);
+ GGML_ASSERT(hparams.n_expert_used > 0);
+
+ // MoE branch
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+ }
+ }
 
  if (backend == GGML_BACKEND_GPU) {
  vram_weights +=
@@ -3032,8 +3089,18 @@ static void llm_load_tensors(
  (layer.bk ? ggml_nbytes(layer.bk) : 0) +
  (layer.bv ? ggml_nbytes(layer.bv) : 0) +
  (layer.bo ? ggml_nbytes(layer.bo) : 0) +
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ ggml_nbytes(layer.ffn_norm);
+
+ if (layer.ffn_gate_inp == nullptr) {
+ vram_weights +=
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ } else {
+ vram_weights += ggml_nbytes(layer.ffn_gate_inp);
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+ vram_weights +=
+ ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
+ }
+ }
  }
  }
  } break;
@@ -3750,8 +3817,8 @@ static void llm_build_k_shift(
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k_l[il],
  n_embd_head, n_head_kv, n_ctx,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
  0),
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
@@ -3780,7 +3847,7 @@ static void llm_build_kv_store(
  cb(v_cur_t, "v_cur_t", il);
 
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
- (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
+ (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
  cb(k_cache_view, "k_cache_view", il);
 
  struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -3939,8 +4006,8 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * k =
  ggml_view_3d(ctx, kv.k_l[il],
  n_embd_head, n_kv, n_head_kv,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
  0);
  cb(k, "k", il);
 
@@ -4014,6 +4081,8 @@ struct llm_build_context {
  const int64_t n_head_kv;
  const int64_t n_embd_head;
  const int64_t n_embd_gqa;
+ const int64_t n_expert;
+ const int64_t n_expert_used;
 
  const float freq_base;
  const float freq_scale;
@@ -4055,6 +4124,8 @@ struct llm_build_context {
  n_head_kv (hparams.n_head_kv),
  n_embd_head (hparams.n_embd_head()),
  n_embd_gqa (hparams.n_embd_gqa()),
+ n_expert (hparams.n_expert),
+ n_expert_used (hparams.n_expert_used),
  freq_base (cparams.rope_freq_base),
  freq_scale (cparams.rope_freq_scale),
  ext_factor (cparams.yarn_ext_factor),
@@ -4179,7 +4250,7 @@ struct llm_build_context {
  cb(ffn_inp, "ffn_inp", il);
 
  // feed-forward network
- {
+ if (model.layers[il].ffn_gate_inp == nullptr) {
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -4191,6 +4262,69 @@
  model.layers[il].ffn_down, NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+ cb(logits, "ffn_moe_logits", il);
+
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+ cb(probs, "ffn_moe_probs", il);
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+ ggml_tensor * weights = ggml_get_rows(ctx0,
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+ cb(weights, "ffn_moe_weights", il);
+
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ // compute expert outputs
+ ggml_tensor * moe_out = nullptr;
+
+ for (int i = 0; i < n_expert_used; ++i) {
+ ggml_tensor * cur_expert;
+
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+ cb(cur_up, "ffn_moe_up", il);
+
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+ cb(cur_gate, "ffn_moe_gate", il);
+
+ cur_gate = ggml_silu(ctx0, cur_gate);
+ cb(cur_gate, "ffn_moe_silu", il);
+
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_gate_par", il);
+
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_down", il);
+
+ cur_expert = ggml_mul(ctx0, cur_expert,
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+ cb(cur_expert, "ffn_moe_weighted", il);
+
+ if (i == 0) {
+ moe_out = cur_expert;
+ } else {
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
+ cb(moe_out, "ffn_moe_out", il);
+ }
+ }
+
+ cur = moe_out;
  }
 
  cur = ggml_add(ctx0, cur, ffn_inp);
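Reading the MoE branch above: the router (ffn_gate_inp) projects each token onto n_expert logits, softmaxes them, keeps the top n_expert_used experts, renormalizes the selected probabilities so they sum to 1, and uses them to mix the experts' SwiGLU outputs. A hedged scalar sketch of just the weight renormalization, outside the graph API, with illustrative values:

```c
#include <stdio.h>

// Toy renormalization of top-k router probabilities for one token,
// mirroring ffn_moe_weights_sum / ffn_moe_weights_norm above.
int main(void) {
    // softmax probabilities of the 2 selected experts out of 8 (illustrative values)
    float w[2] = { 0.42f, 0.18f };
    const float sum = w[0] + w[1];
    for (int i = 0; i < 2; ++i) {
        w[i] /= sum; // 0.70 and 0.30: mixing weights applied to the two expert outputs
    }
    printf("%.2f %.2f\n", w[0], w[1]);
    return 0;
}
```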
@@ -5445,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "ffn_relu", OFFLOAD_FUNC },
  { "ffn_sqr(relu)", OFFLOAD_FUNC },
 
+ { "ffn_moe_logits", OFFLOAD_FUNC },
+ { "ffn_moe_probs", OFFLOAD_FUNC },
+ { "ffn_moe_argsort", OFFLOAD_FUNC },
+ { "ffn_moe_weights", OFFLOAD_FUNC },
+ { "ffn_moe_weights_sum", OFFLOAD_FUNC },
+ { "ffn_moe_weights_norm", OFFLOAD_FUNC },
+ { "ffn_moe_weighted", OFFLOAD_FUNC },
+ { "ffn_moe_up", OFFLOAD_FUNC },
+ { "ffn_moe_gate", OFFLOAD_FUNC },
+ { "ffn_moe_silu", OFFLOAD_FUNC },
+ { "ffn_moe_gate_par", OFFLOAD_FUNC },
+ { "ffn_moe_down", OFFLOAD_FUNC },
+ { "ffn_moe_out", OFFLOAD_FUNC },
+
  { "l_out", OFFLOAD_FUNC },
 
  { "result_norm", OFFLOAD_FUNC_EMB },
@@ -5841,7 +5989,7 @@ static int llama_decode_internal(
  const int64_t n_embd = hparams.n_embd;
  const int64_t n_vocab = hparams.n_vocab;
 
- // helpers for smoother batch API transistion
+ // helpers for smoother batch API transition
  // after deprecating the llama_eval calls, these will be removed
  std::vector<llama_pos> pos;
 
@@ -6620,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
  // loop over the text
  while (true) {
- // find the first occurence of a given special token in this fragment
+ // find the first occurrence of a given special token in this fragment
  // passing offset argument only limit the "search area" but match coordinates
  // are still relative to the source full raw_text
  auto match = raw_text->find(special_token, raw_text_base_offset);
 
- // no occurences found, stop processing this fragment for a given special token
+ // no occurrences found, stop processing this fragment for a given special token
  if (match == std::string::npos) break;
 
  // check if match is within bounds of offset <-> length
@@ -7498,7 +7646,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
  for (size_t i = 0; i < candidates->size; ++i) {
  const llama_token id = candidates->data[i].id;
- const std::string & piece = ctx->model.vocab.id_to_token[id].text;
+ const std::string piece = llama_token_to_piece(ctx, id);
  if (id == eos) {
  if (!allow_eos) {
  candidates->data[i].logit = -INFINITY;
@@ -7710,7 +7858,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }
 
- const std::string & piece = ctx->model.vocab.id_to_token[token].text;
+ const std::string piece = llama_token_to_piece(ctx, token);
 
  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7824,7 +7972,7 @@ struct llama_beam_search_data {
  }
 
  // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The repetative patterns below reflect the 2 stages of heaps:
+ // The repetitive patterns below reflect the 2 stages of heaps:
  // * Gather elements until the vector is full, then call std::make_heap() on it.
  // * If the heap is full and a new element is found that should be included, pop the
  // least element to the back(), replace it with the new, then push it into the heap.
@@ -8062,11 +8210,9 @@ static void llama_convert_tensor_internal(
  workers.clear();
  }
 
- static ggml_type get_k_quant_type(
- quantize_state_internal & qs,
- ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
- ) {
+ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
  const std::string name = ggml_get_name(tensor);
+
  // TODO: avoid hardcoded tensor names - use the TN_* constants
  const llm_arch arch = qs.model.arch;
  const auto tn = LLM_TN(arch);
@@ -8100,7 +8246,18 @@ static ggml_type get_k_quant_type(
  // nearly negligible increase in model size by quantizing this tensor with more bits:
  if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
  }
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  ++qs.i_attention_wv;
+ } else if (name.find("attn_k.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8309,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
  // quantize only 2D tensors
- quantize &= (tensor->n_dims == 2);
+ quantize &= (ggml_n_dims(tensor) == 2);
  quantize &= params->quantize_output_tensor || name != "output.weight";
  quantize &= !params->only_copy;
 
+ // do not quantize expert gating tensors
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
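With the n_dims field removed from ggml_tensor (see the header change earlier in this diff), dimensionality checks now go through ggml_n_dims(), which per the header comment returns 1 for scalars. A simplified sketch of the quantizer's new filter; should_quantize() is a hypothetical helper that only mirrors the two checks changed above, not the full set of conditions:

```c
#include "ggml.h"
#include <stdbool.h>
#include <string.h>

// Keep only 2D weight matrices and skip the expert-gating tensors,
// mirroring the two updated checks in llama_model_quantize_internal.
static bool should_quantize(const struct ggml_tensor * tensor) {
    const char * name = ggml_get_name(tensor);
    bool quantize = ggml_n_dims(tensor) == 2;                        // was: tensor->n_dims == 2
    quantize = quantize && strstr(name, "ffn_gate_inp.weight") == NULL; // new exclusion
    return quantize;
}
```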
@@ -216,7 +216,7 @@ extern "C" {
 
  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
  bool embedding; // embedding mode only
  bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
  };
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.10.0'
+ VERSION = '0.10.1'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1620'
+ LLAMA_CPP_VERSION = 'b1641'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -256,6 +256,8 @@ module LLaMACpp
  class ModelQuantizeParams
  public
 
+ attr_reader params: ::LLaMACpp::ModelParams
+
  def n_thread: () -> Integer
  def n_thread=: (Integer) -> Integer
  def ftype: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.10.0
+ version: 0.10.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-12-09 00:00:00.000000000 Z
+ date: 2023-12-16 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: