llama_cpp 0.10.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -215,9 +215,9 @@
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
  #define GGML_MAX_DIMS 4
- #define GGML_MAX_PARAMS 1024
+ #define GGML_MAX_PARAMS 2048
  #define GGML_MAX_CONTEXTS 64
- #define GGML_MAX_SRC 6
+ #define GGML_MAX_SRC 10
  #define GGML_MAX_NAME 64
  #define GGML_MAX_OP_PARAMS 64
  #define GGML_DEFAULT_N_THREADS 4
@@ -423,7 +423,9 @@ extern "C" {
  GGML_OP_POOL_1D,
  GGML_OP_POOL_2D,
  GGML_OP_UPSCALE, // nearest interpolate
+ GGML_OP_PAD,
  GGML_OP_ARGSORT,
+ GGML_OP_LEAKY_RELU,
 
  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -463,7 +465,6 @@ extern "C" {
  GGML_UNARY_OP_GELU,
  GGML_UNARY_OP_GELU_QUICK,
  GGML_UNARY_OP_SILU,
- GGML_UNARY_OP_LEAKY,
 
  GGML_UNARY_OP_COUNT,
  };
@@ -501,7 +502,6 @@ extern "C" {
 
  struct ggml_backend_buffer * buffer;
 
- int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
  // nb[0] = ggml_type_size(type)
@@ -533,7 +533,7 @@ extern "C" {
 
  void * extra; // extra things e.g. for ggml-cuda.cu
 
- char padding[12];
+ char padding[8];
  };
 
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -638,11 +638,14 @@ extern "C" {
  GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
  GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
- GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
- GGML_API int ggml_blck_size (enum ggml_type type);
- GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
- GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+ GGML_API int ggml_blck_size(enum ggml_type type);
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+ GGML_DEPRECATED(
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+ "use ggml_row_size() instead");
 
  GGML_API const char * ggml_type_name(enum ggml_type type);
  GGML_API const char * ggml_op_name (enum ggml_op op);
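
Note: ggml_type_sizef() is deprecated here in favour of ggml_row_size(), which returns an exact byte count for a whole row rather than a fractional per-element size. A minimal migration sketch (the Q4_K type and the n_embd element count are placeholders, not values from this diff):

    // before (deprecated): fractional bytes per element, scaled up by the caller
    // size_t row_bytes = (size_t)(ggml_type_sizef(GGML_TYPE_Q4_K) * n_embd);

    // after: exact size in bytes of one row of n_embd elements
    size_t row_bytes = ggml_row_size(GGML_TYPE_Q4_K, n_embd);

The KV-cache and view-stride hunks later in this diff apply exactly this substitution.
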
@@ -661,6 +664,11 @@ extern "C" {
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
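
Note: these helpers replace the n_dims field removed from struct ggml_tensor earlier in this diff; callers now derive the rank from ne[] through the API. An illustrative caller-side sketch (tensor is a placeholder):

    // before: read the struct field that no longer exists
    // bool is_2d = tensor->n_dims == 2;

    // after: ask the API (ggml_n_dims() returns 1 for scalars)
    bool is_2d = ggml_n_dims(tensor) == 2;

The quantization hunk near the end of this diff (quantize &= (ggml_n_dims(tensor) == 2)) is the same migration.
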
 
@@ -793,6 +801,9 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ // dst = a
+ // view(dst, nb1, nb2, nb3, offset) += b
+ // return dst
  GGML_API struct ggml_tensor * ggml_acc(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -957,15 +968,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
- GGML_API struct ggml_tensor * ggml_leaky(
+ GGML_API struct ggml_tensor * ggml_leaky_relu(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a, float negative_slope, bool inplace);
 
  GGML_API struct ggml_tensor * ggml_relu_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
- // TODO: double-check this computation is correct
  GGML_API struct ggml_tensor * ggml_gelu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
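
Note: ggml_leaky() is renamed to ggml_leaky_relu() and gains an explicit negative slope and an in-place flag. A minimal call-site sketch; ctx/cur are placeholders and the 0.1f slope is an illustrative value, not one taken from this diff:

    // before: cur = ggml_leaky(ctx, cur);
    cur = ggml_leaky_relu(ctx, cur, 0.1f, false);
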
@@ -1051,7 +1061,8 @@ extern "C" {
  // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
- struct ggml_tensor * as[],
+ struct ggml_tensor * const as[],
+ int n_as,
  struct ggml_tensor * ids,
  int id,
  struct ggml_tensor * b);
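
Note: ggml_mul_mat_id() now takes the number of candidate matrices (n_as) and a const array, while still computing roughly ggml_mul_mat(as[ids[id]], b). A call site with the extra argument, quoted from the MoE graph hunk later in this diff:

    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
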
@@ -1263,6 +1274,7 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // supports 3D: a->ne[2] == b->ne[1]
  GGML_API struct ggml_tensor * ggml_get_rows(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
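
Note: the new 3D form of ggml_get_rows() (a->ne[2] == b->ne[1]) is what the MoE routing later in this diff relies on: the expert probabilities are reshaped to [1, n_expert, n_tokens] and one index row is gathered per token. The pattern, quoted from that hunk:

    ggml_tensor * weights = ggml_get_rows(ctx0,
            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
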
@@ -1549,6 +1561,15 @@ extern "C" {
  struct ggml_tensor * a,
  int scale_factor);
 
+ // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+ GGML_API struct ggml_tensor * ggml_pad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1,
+ int p2,
+ int p3);
+
  // sort rows
  enum ggml_sort_order {
  GGML_SORT_ASC,
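
Note: a minimal sketch of the new ggml_pad() op; p0..p3 are the number of zero elements appended at the end of dimensions 0..3, and the example shape is assumed:

    // a has ne = {n, m, 1, 1}; pad dim 0 by 2 and dim 1 by 1 -> ne = {n + 2, m + 1, 1, 1}
    struct ggml_tensor * padded = ggml_pad(ctx, a, 2, 1, 0, 0);
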
@@ -91,7 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif
 
- #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_EXPERTS 8
 
  //
  // logging
@@ -231,6 +232,8 @@ enum llm_kv {
  LLM_KV_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
+ LLM_KV_EXPERT_COUNT,
+ LLM_KV_EXPERT_USED_COUNT,
 
  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -281,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
  { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
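
Note: with the architecture prefix substituted, these entries resolve to GGUF metadata keys such as llama.expert_count and llama.expert_used_count. Purely as an illustration (values are not taken from this diff), a Mixtral-style 8-expert checkpoint would carry:

    llama.expert_count      = 8
    llama.expert_used_count = 2
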
@@ -338,10 +343,14 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_NORM,
  LLM_TENSOR_ATTN_NORM_2,
  LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
- LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  };
@@ -360,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  },
  },
  {
@@ -585,6 +598,10 @@ struct LLM_TN {
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  }
+
+ std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+ }
  };
 
  //
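
Note: the new four-argument operator() is what llm_load_tensors uses below to name per-expert tensors. With the "blk.%d.ffn_gate.%d" pattern from LLM_TENSOR_NAMES, an illustrative call for layer 3, expert 2 resolves as:

    // bid and xid values are hypothetical
    std::string name = tn(LLM_TENSOR_FFN_GATE_EXP, "weight", 3, 2); // "blk.3.ffn_gate.2.weight"
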
@@ -1159,6 +1176,8 @@ struct llama_hparams {
  uint32_t n_layer;
  uint32_t n_rot;
  uint32_t n_ff;
+ uint32_t n_expert = 0;
+ uint32_t n_expert_used = 0;
 
  float f_norm_eps;
  float f_norm_rms_eps;
@@ -1173,15 +1192,18 @@ struct llama_hparams {
  float f_max_alibi_bias;
 
  bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
- if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_head != other.n_head) return true;
- if (this->n_head_kv != other.n_head_kv) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_ff != other.n_ff) return true;
+ if (this->vocab_only != other.vocab_only) return true;
+ if (this->n_vocab != other.n_vocab) return true;
+ if (this->n_ctx_train != other.n_ctx_train) return true;
+ if (this->n_embd != other.n_embd) return true;
+ if (this->n_head != other.n_head) return true;
+ if (this->n_head_kv != other.n_head_kv) return true;
+ if (this->n_layer != other.n_layer) return true;
+ if (this->n_rot != other.n_rot) return true;
+ if (this->n_ff != other.n_ff) return true;
+ if (this->n_expert != other.n_expert) return true;
+ if (this->n_expert_used != other.n_expert_used) return true;
+
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1263,6 +1285,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down; // w2
  struct ggml_tensor * ffn_up; // w3
 
+ // ff MoE
+ struct ggml_tensor * ffn_gate_inp;
+ struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -1522,7 +1550,7 @@ static bool llama_kv_cache_init(
  cache.cells.clear();
  cache.cells.resize(n_ctx);
 
- cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
+ cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
  memset(cache.buf.data, 0, cache.buf.size);
 
  struct ggml_init_params params;
@@ -2435,6 +2463,16 @@ static void llm_load_hparams(
  ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
  ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
  ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+ ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+ GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+ GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+ if (hparams.n_expert > 0) {
+ GGML_ASSERT(hparams.n_expert_used > 0);
+ } else {
+ GGML_ASSERT(hparams.n_expert_used == 0);
+ }
 
  // n_head_kv is optional, default to n_head
  hparams.n_head_kv = hparams.n_head;
@@ -2753,7 +2791,7 @@ static void llm_load_vocab(
  // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
  // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
  // are special tokens.
- // From testing, this appears to corelate 1:1 with special tokens.
+ // From testing, this appears to correlate 1:1 with special tokens.
  //
 
  // Counting special tokens and verifying in only one direction
@@ -2866,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3020,9 +3060,26 @@ static void llm_load_tensors(
 
  layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
+
+ if (layer.ffn_gate_inp == nullptr) {
+ GGML_ASSERT(hparams.n_expert == 0);
+ GGML_ASSERT(hparams.n_expert_used == 0);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ } else {
+ GGML_ASSERT(hparams.n_expert > 0);
+ GGML_ASSERT(hparams.n_expert_used > 0);
+
+ // MoE branch
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+ }
+ }
 
  if (backend == GGML_BACKEND_GPU) {
  vram_weights +=
@@ -3032,8 +3089,18 @@
  (layer.bk ? ggml_nbytes(layer.bk) : 0) +
  (layer.bv ? ggml_nbytes(layer.bv) : 0) +
  (layer.bo ? ggml_nbytes(layer.bo) : 0) +
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
- ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ ggml_nbytes(layer.ffn_norm);
+
+ if (layer.ffn_gate_inp == nullptr) {
+ vram_weights +=
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ } else {
+ vram_weights += ggml_nbytes(layer.ffn_gate_inp);
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+ vram_weights +=
+ ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
+ }
+ }
  }
  }
  } break;
@@ -3750,8 +3817,8 @@ static void llm_build_k_shift(
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k_l[il],
  n_embd_head, n_head_kv, n_ctx,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
  0),
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
@@ -3780,7 +3847,7 @@ static void llm_build_kv_store(
  cb(v_cur_t, "v_cur_t", il);
 
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
- (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
+ (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
  cb(k_cache_view, "k_cache_view", il);
 
  struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -3939,8 +4006,8 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * k =
  ggml_view_3d(ctx, kv.k_l[il],
  n_embd_head, n_kv, n_head_kv,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
- ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
  0);
  cb(k, "k", il);
 
@@ -4014,6 +4081,8 @@ struct llm_build_context {
  const int64_t n_head_kv;
  const int64_t n_embd_head;
  const int64_t n_embd_gqa;
+ const int64_t n_expert;
+ const int64_t n_expert_used;
 
  const float freq_base;
  const float freq_scale;
@@ -4055,6 +4124,8 @@ struct llm_build_context {
  n_head_kv (hparams.n_head_kv),
  n_embd_head (hparams.n_embd_head()),
  n_embd_gqa (hparams.n_embd_gqa()),
+ n_expert (hparams.n_expert),
+ n_expert_used (hparams.n_expert_used),
  freq_base (cparams.rope_freq_base),
  freq_scale (cparams.rope_freq_scale),
  ext_factor (cparams.yarn_ext_factor),
@@ -4179,7 +4250,7 @@ struct llm_build_context {
  cb(ffn_inp, "ffn_inp", il);
 
  // feed-forward network
- {
+ if (model.layers[il].ffn_gate_inp == nullptr) {
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -4191,6 +4262,69 @@ extern "C" {
  model.layers[il].ffn_down, NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+ cb(logits, "ffn_moe_logits", il);
+
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+ cb(probs, "ffn_moe_probs", il);
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+ ggml_tensor * weights = ggml_get_rows(ctx0,
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+ cb(weights, "ffn_moe_weights", il);
+
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ // compute expert outputs
+ ggml_tensor * moe_out = nullptr;
+
+ for (int i = 0; i < n_expert_used; ++i) {
+ ggml_tensor * cur_expert;
+
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+ cb(cur_up, "ffn_moe_up", il);
+
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+ cb(cur_gate, "ffn_moe_gate", il);
+
+ cur_gate = ggml_silu(ctx0, cur_gate);
+ cb(cur_gate, "ffn_moe_silu", il);
+
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_gate_par", il);
+
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_down", il);
+
+ cur_expert = ggml_mul(ctx0, cur_expert,
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+ cb(cur_expert, "ffn_moe_weighted", il);
+
+ if (i == 0) {
+ moe_out = cur_expert;
+ } else {
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
+ cb(moe_out, "ffn_moe_out", il);
+ }
+ }
+
+ cur = moe_out;
  }
 
  cur = ggml_add(ctx0, cur, ffn_inp);
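
Note: the routing added in this hunk is top-k gating with renormalization: softmax over all experts, keep the n_expert_used largest probabilities, rescale them to sum to 1, then mix the selected experts' FFN outputs with those weights. A standalone sketch of that arithmetic on plain floats, assuming a hypothetical 4-expert router with 2 experts used and made-up logits:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<float> logits = {1.0f, 3.0f, 0.5f, 2.0f}; // router output for one token
        const int n_expert_used = 2;

        // softmax over all experts (ffn_moe_probs)
        float mx = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(logits.size());
        float sum = 0.0f;
        for (size_t e = 0; e < logits.size(); ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (float & p : probs) p /= sum;

        // indices of the top n_expert_used probabilities (ffn_moe_argsort / ggml_top_k)
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected weights so they sum to 1 (ffn_moe_weights_norm)
        float wsum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) wsum += probs[idx[i]];
        for (int i = 0; i < n_expert_used; ++i) {
            std::printf("expert %d weight %.3f\n", idx[i], probs[idx[i]] / wsum);
        }
        // the layer output is then sum_i weight_i * FFN_expert_idx[i](x)
        return 0;
    }
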
@@ -5445,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "ffn_relu", OFFLOAD_FUNC },
  { "ffn_sqr(relu)", OFFLOAD_FUNC },
 
+ { "ffn_moe_logits", OFFLOAD_FUNC },
+ { "ffn_moe_probs", OFFLOAD_FUNC },
+ { "ffn_moe_argsort", OFFLOAD_FUNC },
+ { "ffn_moe_weights", OFFLOAD_FUNC },
+ { "ffn_moe_weights_sum", OFFLOAD_FUNC },
+ { "ffn_moe_weights_norm", OFFLOAD_FUNC },
+ { "ffn_moe_weighted", OFFLOAD_FUNC },
+ { "ffn_moe_up", OFFLOAD_FUNC },
+ { "ffn_moe_gate", OFFLOAD_FUNC },
+ { "ffn_moe_silu", OFFLOAD_FUNC },
+ { "ffn_moe_gate_par", OFFLOAD_FUNC },
+ { "ffn_moe_down", OFFLOAD_FUNC },
+ { "ffn_moe_out", OFFLOAD_FUNC },
+
  { "l_out", OFFLOAD_FUNC },
 
  { "result_norm", OFFLOAD_FUNC_EMB },
@@ -5841,7 +5989,7 @@ static int llama_decode_internal(
  const int64_t n_embd = hparams.n_embd;
  const int64_t n_vocab = hparams.n_vocab;
 
- // helpers for smoother batch API transistion
+ // helpers for smoother batch API transition
  // after deprecating the llama_eval calls, these will be removed
  std::vector<llama_pos> pos;
 
@@ -6620,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
  // loop over the text
  while (true) {
- // find the first occurence of a given special token in this fragment
+ // find the first occurrence of a given special token in this fragment
  // passing offset argument only limit the "search area" but match coordinates
  // are still relative to the source full raw_text
  auto match = raw_text->find(special_token, raw_text_base_offset);
 
- // no occurences found, stop processing this fragment for a given special token
+ // no occurrences found, stop processing this fragment for a given special token
  if (match == std::string::npos) break;
 
  // check if match is within bounds of offset <-> length
@@ -7498,7 +7646,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
  for (size_t i = 0; i < candidates->size; ++i) {
  const llama_token id = candidates->data[i].id;
- const std::string & piece = ctx->model.vocab.id_to_token[id].text;
+ const std::string piece = llama_token_to_piece(ctx, id);
  if (id == eos) {
  if (!allow_eos) {
  candidates->data[i].logit = -INFINITY;
@@ -7710,7 +7858,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }
 
- const std::string & piece = ctx->model.vocab.id_to_token[token].text;
+ const std::string piece = llama_token_to_piece(ctx, token);
 
  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7824,7 +7972,7 @@ struct llama_beam_search_data {
  }
 
  // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The repetative patterns below reflect the 2 stages of heaps:
+ // The repetitive patterns below reflect the 2 stages of heaps:
  // * Gather elements until the vector is full, then call std::make_heap() on it.
  // * If the heap is full and a new element is found that should be included, pop the
  // least element to the back(), replace it with the new, then push it into the heap.
@@ -8062,11 +8210,9 @@ static void llama_convert_tensor_internal(
  workers.clear();
  }
 
- static ggml_type get_k_quant_type(
- quantize_state_internal & qs,
- ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
- ) {
+ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
  const std::string name = ggml_get_name(tensor);
+
  // TODO: avoid hardcoded tensor names - use the TN_* constants
  const llm_arch arch = qs.model.arch;
  const auto tn = LLM_TN(arch);
@@ -8100,7 +8246,18 @@ static ggml_type get_k_quant_type(
  // nearly negligible increase in model size by quantizing this tensor with more bits:
  if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
  }
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  ++qs.i_attention_wv;
+ } else if (name.find("attn_k.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8309,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
  // quantize only 2D tensors
- quantize &= (tensor->n_dims == 2);
+ quantize &= (ggml_n_dims(tensor) == 2);
  quantize &= params->quantize_output_tensor || name != "output.weight";
  quantize &= !params->only_copy;
 
+ // do not quantize expert gating tensors
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
@@ -216,7 +216,7 @@ extern "C" {
 
  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
  bool embedding; // embedding mode only
  bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
  };
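
Note: logits_all is now documented as deprecated in favour of per-token control through llama_batch.logits. A hedged sketch of the replacement pattern, assuming an already-populated llama_batch named batch (field names per llama.h, not shown in this diff):

    // instead of logits_all = true, request logits only where needed,
    // e.g. for the final token of the batch
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.logits[i] = 0;
    }
    batch.logits[batch.n_tokens - 1] = 1;
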
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.10.0'
+ VERSION = '0.10.1'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1620'
+ LLAMA_CPP_VERSION = 'b1641'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -256,6 +256,8 @@ module LLaMACpp
  class ModelQuantizeParams
  public
 
+ attr_reader params: ::LLaMACpp::ModelParams
+
  def n_thread: () -> Integer
  def n_thread=: (Integer) -> Integer
  def ftype: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.10.0
+ version: 0.10.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-12-09 00:00:00.000000000 Z
+ date: 2023-12-16 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: