llama_cpp 0.9.5 → 0.10.1
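The headline changes in this release are Mixtral-style mixture-of-experts support (LLAMA_MAX_EXPERTS and the ffn_*_exp tensors), a per-layer KV cache (k_l/v_l), the Qwen architecture, and user-supplied metadata overrides threaded through llama_model_loader. As a minimal sketch of how the new kv_overrides hook might be driven from the C API in this version — assuming the llama_model_default_params, llama_load_model_from_file and llama_free_model entry points from the vendored llama.h, which are not part of this excerpt — one could write:

// Sketch only: override one GGUF metadata key when loading a model.
// The struct fields (key, tag, int_value) and the "terminate the array with an
// entry whose key starts with '\0'" convention come from the loader loop below;
// the chosen key and value are purely illustrative.
#include "llama.h"
#include <cstring>

int main() {
    llama_model_kv_override overrides[2] = {};
    std::strncpy(overrides[0].key, "llama.expert_used_count", sizeof(overrides[0].key) - 1);
    overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
    overrides[0].int_value = 2;              // hypothetical value, for illustration
    // overrides[1] stays zero-initialized -> empty key terminates the list

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;        // consumed by llama_model_loader::get_key()

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model) { llama_free_model(model); }
    return 0;
}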

@@ -74,6 +74,7 @@
  #include <set>
  #include <sstream>
  #include <thread>
+ #include <type_traits>
  #include <unordered_map>
 
  #if defined(_MSC_VER)
@@ -90,7 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif
 
- #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_NODES 8192
+ #define LLAMA_MAX_EXPERTS 8
 
  //
  // logging
@@ -192,6 +194,7 @@ enum llm_arch {
  LLM_ARCH_REFACT,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
+ LLM_ARCH_QWEN,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -208,6 +211,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
  };
 
  enum llm_kv {
@@ -228,6 +232,8 @@ enum llm_kv {
  LLM_KV_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
+ LLM_KV_EXPERT_COUNT,
+ LLM_KV_EXPERT_USED_COUNT,
 
  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -278,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
  { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -335,10 +343,14 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_NORM,
  LLM_TENSOR_ATTN_NORM_2,
  LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
- LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  };
@@ -357,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  },
  },
  {
@@ -518,6 +534,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
 
  {
  LLM_ARCH_UNKNOWN,
@@ -566,27 +598,16 @@ struct LLM_TN {
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  }
+
+ std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+ }
  };
 
  //
  // gguf helpers
  //
 
- #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
- do { \
- const std::string skey(key); \
- const int kid = gguf_find_key(ctx, skey.c_str()); \
- if (kid >= 0) { \
- enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
- if (ktype != (type)) { \
- throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
- } \
- (dst) = func(ctx, kid); \
- } else if (req) { \
- throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
- } \
- } while (0)
-
  static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
  { LLAMA_ROPE_SCALING_NONE, "none" },
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -620,7 +641,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
  }
  }
 
- static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
 
  switch (type) {
@@ -1155,6 +1176,8 @@ struct llama_hparams {
  uint32_t n_layer;
  uint32_t n_rot;
  uint32_t n_ff;
+ uint32_t n_expert = 0;
+ uint32_t n_expert_used = 0;
 
  float f_norm_eps;
  float f_norm_rms_eps;
@@ -1169,15 +1192,18 @@ struct llama_hparams {
  float f_max_alibi_bias;
 
  bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
- if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_head != other.n_head) return true;
- if (this->n_head_kv != other.n_head_kv) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_ff != other.n_ff) return true;
+ if (this->vocab_only != other.vocab_only) return true;
+ if (this->n_vocab != other.n_vocab) return true;
+ if (this->n_ctx_train != other.n_ctx_train) return true;
+ if (this->n_embd != other.n_embd) return true;
+ if (this->n_head != other.n_head) return true;
+ if (this->n_head_kv != other.n_head_kv) return true;
+ if (this->n_layer != other.n_layer) return true;
+ if (this->n_rot != other.n_rot) return true;
+ if (this->n_ff != other.n_ff) return true;
+ if (this->n_expert != other.n_expert) return true;
+ if (this->n_expert_used != other.n_expert_used) return true;
+
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1222,6 +1248,7 @@ struct llama_cparams {
  float yarn_beta_slow;
 
  bool mul_mat_q;
+ bool offload_kqv;
  };
 
  struct llama_layer {
@@ -1243,6 +1270,9 @@ struct llama_layer {
  struct ggml_tensor * wqkv;
 
  // attention bias
+ struct ggml_tensor * bq;
+ struct ggml_tensor * bk;
+ struct ggml_tensor * bv;
  struct ggml_tensor * bo;
  struct ggml_tensor * bqkv;
 
@@ -1255,6 +1285,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down; // w2
  struct ggml_tensor * ffn_up; // w3
 
+ // ff MoE
+ struct ggml_tensor * ffn_gate_inp;
+ struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -1287,8 +1323,8 @@ struct llama_kv_cache {
 
  std::vector<llama_kv_cell> cells;
 
- struct ggml_tensor * k = NULL;
- struct ggml_tensor * v = NULL;
+ std::vector<struct ggml_tensor *> k_l; // per layer
+ std::vector<struct ggml_tensor *> v_l;
 
  struct ggml_context * ctx = NULL;
 
@@ -1301,8 +1337,10 @@ struct llama_kv_cache {
 
  #ifdef GGML_USE_CUBLAS
  if (ggml_cublas_loaded()) {
- ggml_cuda_free_data(k);
- ggml_cuda_free_data(v);
+ for (size_t i = 0; i < k_l.size(); ++i) {
+ ggml_cuda_free_data(k_l[i]);
+ ggml_cuda_free_data(v_l[i]);
+ }
  }
  #endif
  }
@@ -1492,9 +1530,11 @@ struct llama_context {
  static bool llama_kv_cache_init(
  const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
- ggml_type wtype,
+ ggml_type ktype,
+ ggml_type vtype,
  uint32_t n_ctx,
- int n_gpu_layers) {
+ int n_gpu_layers,
+ bool offload) {
  const uint32_t n_embd = hparams.n_embd_gqa();
  const uint32_t n_layer = hparams.n_layer;
 
@@ -1510,7 +1550,7 @@ static bool llama_kv_cache_init(
  cache.cells.clear();
  cache.cells.resize(n_ctx);
 
- cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+ cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
  memset(cache.buf.data, 0, cache.buf.size);
 
  struct ggml_init_params params;
@@ -1520,37 +1560,44 @@ static bool llama_kv_cache_init(
 
  cache.ctx = ggml_init(params);
 
+ size_t vram_kv_cache = 0;
+
  if (!cache.ctx) {
  LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
  return false;
  }
 
- cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
- cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
- ggml_set_name(cache.k, "cache_k");
- ggml_set_name(cache.v, "cache_v");
+ cache.k_l.reserve(n_layer);
+ cache.v_l.reserve(n_layer);
 
- (void) n_gpu_layers;
+ const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
 
- #ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- size_t vram_kv_cache = 0;
+ GGML_UNUSED(offload);
 
- if (n_gpu_layers > (int)n_layer + 1) {
- ggml_cuda_assign_buffers_no_scratch(cache.v);
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.v);
- }
- if (n_gpu_layers > (int)n_layer + 2) {
- ggml_cuda_assign_buffers_no_scratch(cache.k);
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.k);
- }
- if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ for (int i = 0; i < (int) n_layer; i++) {
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+ #ifdef GGML_USE_CUBLAS
+ if (i >= i_gpu_start) {
+ if (offload) {
+ ggml_cuda_assign_buffers_no_scratch(k);
+ vram_kv_cache += ggml_nbytes(k);
+ ggml_cuda_assign_buffers_no_scratch(v);
+ vram_kv_cache += ggml_nbytes(v);
+ }
  }
+ #endif // GGML_USE_CUBLAS
  }
- #endif
+
+ if (vram_kv_cache > 0) {
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ }
+
+ GGML_UNUSED(n_gpu_layers);
 
  return true;
  }
@@ -1771,6 +1818,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
1771
1818
  return buf;
1772
1819
  }
1773
1820
 
1821
+ namespace GGUFMeta {
1822
+ template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
1823
+ struct GKV_Base_Type {
1824
+ static constexpr gguf_type gt = gt_;
1825
+
1826
+ static T getter(const gguf_context * ctx, const int kid) {
1827
+ return gfun(ctx, kid);
1828
+ }
1829
+ };
1830
+
1831
+ template<typename T> struct GKV_Base;
1832
+
1833
+ template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
1834
+ template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
1835
+ template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
1836
+ template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
1837
+ template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
1838
+ template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
1839
+ template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
1840
+ template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
1841
+ template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
1842
+ template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
1843
+ template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
1844
+ template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
1845
+
1846
+ template<> struct GKV_Base<std::string> {
1847
+ static constexpr gguf_type gt = GGUF_TYPE_STRING;
1848
+
1849
+ static std::string getter(const gguf_context * ctx, const int kid) {
1850
+ return gguf_get_val_str(ctx, kid);
1851
+ }
1852
+ };
1853
+
1854
+ struct ArrayInfo{
1855
+ const gguf_type gt;
1856
+ const size_t length;
1857
+ const void * data;
1858
+ };
1859
+
1860
+ template<> struct GKV_Base<ArrayInfo> {
1861
+ public:
1862
+ static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
1863
+ static ArrayInfo getter(const gguf_context *ctx, const int k) {
1864
+ return ArrayInfo {
1865
+ gguf_get_arr_type(ctx, k),
1866
+ size_t(gguf_get_arr_n(ctx, k)),
1867
+ gguf_get_arr_data(ctx, k),
1868
+ };
1869
+ }
1870
+ };
1871
+
1872
+ template<typename T>
1873
+ class GKV: public GKV_Base<T> {
1874
+ GKV() = delete;
1875
+
1876
+ public:
1877
+ static T get_kv(const gguf_context * ctx, const int k) {
1878
+ const enum gguf_type kt = gguf_get_kv_type(ctx, k);
1879
+
1880
+ if (kt != GKV::gt) {
1881
+ throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
1882
+ gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
1883
+ }
1884
+ return GKV::getter(ctx, k);
1885
+ }
1886
+
1887
+ static const char * override_type_to_str(const llama_model_kv_override_type ty) {
1888
+ switch (ty) {
1889
+ case LLAMA_KV_OVERRIDE_BOOL: return "bool";
1890
+ case LLAMA_KV_OVERRIDE_INT: return "int";
1891
+ case LLAMA_KV_OVERRIDE_FLOAT: return "float";
1892
+ }
1893
+ return "unknown";
1894
+ }
1895
+
1896
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
1897
+ if (!override) { return false; }
1898
+ if (override->tag == expected_type) {
1899
+ LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
1900
+ __func__, override_type_to_str(override->tag), override->key);
1901
+ switch (override->tag) {
1902
+ case LLAMA_KV_OVERRIDE_BOOL: {
1903
+ printf("%s\n", override->bool_value ? "true" : "false");
1904
+ } break;
1905
+ case LLAMA_KV_OVERRIDE_INT: {
1906
+ printf("%" PRId64 "\n", override->int_value);
1907
+ } break;
1908
+ case LLAMA_KV_OVERRIDE_FLOAT: {
1909
+ printf("%.6f\n", override->float_value);
1910
+ } break;
1911
+ default:
1912
+ // Shouldn't be possible to end up here, but just in case...
1913
+ throw std::runtime_error(
1914
+ format("Unsupported attempt to override %s type for metadata key %s\n",
1915
+ override_type_to_str(override->tag), override->key));
1916
+ }
1917
+ return true;
1918
+ }
1919
+ LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
1920
+ __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
1921
+ return false;
1922
+ }
1923
+
1924
+ template<typename OT>
1925
+ static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
1926
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1927
+ if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
1928
+ target = override->bool_value;
1929
+ return true;
1930
+ }
1931
+ return true;
1932
+ }
1933
+
1934
+ template<typename OT>
1935
+ static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
1936
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1937
+ if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
1938
+ target = override->int_value;
1939
+ return true;
1940
+ }
1941
+ return false;
1942
+ }
1943
+
1944
+ template<typename OT>
1945
+ static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
1946
+ try_override(T & target, const struct llama_model_kv_override *override) {
1947
+ if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
1948
+ target = override->float_value;
1949
+ return true;
1950
+ }
1951
+ return false;
1952
+ }
1953
+
1954
+ template<typename OT>
1955
+ static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
1956
+ try_override(T & target, const struct llama_model_kv_override *override) {
1957
+ (void)target;
1958
+ (void)override;
1959
+ if (!override) { return false; }
1960
+ // Currently, we should never end up here so it would be a bug if we do.
1961
+ throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
1962
+ override ? override->key : "NULL"));
1963
+ }
1964
+
1965
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
1966
+ if (try_override<T>(target, override)) {
1967
+ return true;
1968
+ }
1969
+ if (k < 0) { return false; }
1970
+ target = get_kv(ctx, k);
1971
+ return true;
1972
+ }
1973
+
1974
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
1975
+ return set(ctx, gguf_find_key(ctx, key), target, override);
1976
+ }
1977
+
1978
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
1979
+ return set(ctx, key.c_str(), target, override);
1980
+ }
1981
+ };
1982
+ }
1983
+
1774
1984
  struct llama_model_loader {
1775
1985
  int n_kv = 0;
1776
1986
  int n_tensors = 0;
@@ -1786,21 +1996,34 @@ struct llama_model_loader {
1786
1996
  llama_fver fver;
1787
1997
 
1788
1998
  std::unique_ptr<llama_mmap> mapping;
1999
+ std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
1789
2000
 
1790
2001
  struct gguf_context * ctx_gguf = NULL;
1791
2002
  struct ggml_context * ctx_meta = NULL;
1792
2003
 
1793
- llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
2004
+ std::string arch_name;
2005
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
2006
+
2007
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
1794
2008
  struct gguf_init_params params = {
1795
2009
  /*.no_alloc = */ true,
1796
2010
  /*.ctx = */ &ctx_meta,
1797
2011
  };
1798
2012
 
2013
+ if (param_overrides_p != nullptr) {
2014
+ for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
2015
+ kv_overrides.insert({std::string(p->key), *p});
2016
+ }
2017
+ }
2018
+
1799
2019
  ctx_gguf = gguf_init_from_file(fname.c_str(), params);
1800
2020
  if (!ctx_gguf) {
1801
2021
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
1802
2022
  }
1803
2023
 
2024
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
2025
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
2026
+
1804
2027
  n_kv = gguf_get_n_kv(ctx_gguf);
1805
2028
  n_tensors = gguf_get_n_tensors(ctx_gguf);
1806
2029
 
@@ -1868,6 +2091,7 @@ struct llama_model_loader {
  }
  }
 
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
  for (int i = 0; i < n_kv; i++) {
  const char * name = gguf_get_key(ctx_gguf, i);
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -1913,19 +2137,59 @@ struct llama_model_loader {
1913
2137
  }
1914
2138
  }
1915
2139
 
1916
- std::string get_arch_name() const {
1917
- const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
2140
+ template<typename T>
2141
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2142
+ get_arr_n(const std::string & key, T & result, const bool required = true) {
2143
+ const int kid = gguf_find_key(ctx_gguf, key.c_str());
1918
2144
 
1919
- std::string arch_name;
1920
- GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
2145
+ if (kid < 0) {
2146
+ if (required) {
2147
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2148
+ }
2149
+ return false;
2150
+ }
2151
+
2152
+ struct GGUFMeta::ArrayInfo arr_info =
2153
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
2154
+
2155
+
2156
+ result = arr_info.length;
2157
+ return true;
2158
+ }
2159
+
2160
+ template<typename T>
2161
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2162
+ get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
2163
+ return get_arr_n(llm_kv(kid), result, required);
2164
+ }
2165
+
2166
+ template<typename T>
2167
+ bool get_key(const std::string & key, T & result, const bool required = true) {
2168
+ auto it = kv_overrides.find(key);
2169
+
2170
+ const struct llama_model_kv_override * override =
2171
+ it != kv_overrides.end() ? &it->second : nullptr;
2172
+
2173
+ const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
1921
2174
 
2175
+ if (required && !found) {
2176
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2177
+ }
2178
+
2179
+ return found;
2180
+ }
2181
+
2182
+ template<typename T>
2183
+ bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
2184
+ return get_key(llm_kv(kid), result, required);
2185
+ }
2186
+
2187
+ std::string get_arch_name() const {
1922
2188
  return arch_name;
1923
2189
  }
1924
2190
 
1925
2191
  enum llm_arch get_arch() const {
1926
- const std::string arch_name = get_arch_name();
1927
-
1928
- return llm_arch_from_string(arch_name);
2192
+ return llm_kv.arch;
1929
2193
  }
1930
2194
 
1931
2195
  const char * get_tensor_name(int i) const {
@@ -1965,10 +2229,13 @@ struct llama_model_loader {
1965
2229
  return tensor;
1966
2230
  }
1967
2231
 
1968
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
2232
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
1969
2233
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1970
2234
 
1971
2235
  if (cur == NULL) {
2236
+ if (!required) {
2237
+ return NULL;
2238
+ }
1972
2239
  throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
1973
2240
  }
1974
2241
 
@@ -2172,11 +2439,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2172
2439
  static void llm_load_hparams(
2173
2440
  llama_model_loader & ml,
2174
2441
  llama_model & model) {
2175
- struct gguf_context * ctx = ml.ctx_gguf;
2176
-
2177
- const auto kv = LLM_KV(model.arch);
2178
-
2179
2442
  auto & hparams = model.hparams;
2443
+ const gguf_context * ctx = ml.ctx_gguf;
2180
2444
 
2181
2445
  // get metadata as string
2182
2446
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2190,42 +2454,51 @@ static void llm_load_hparams(
2190
2454
  }
2191
2455
 
2192
2456
  // get general kv
2193
- GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
2457
+ ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
2194
2458
 
2195
2459
  // get hparams kv
2196
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
2197
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
2198
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
2199
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
2200
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
2201
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
2460
+ ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
2461
+ ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
2462
+ ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
2463
+ ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
2464
+ ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
2465
+ ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
2466
+ ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
2467
+ ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
2468
+
2469
+ GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
2470
+ GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
2471
+ if (hparams.n_expert > 0) {
2472
+ GGML_ASSERT(hparams.n_expert_used > 0);
2473
+ } else {
2474
+ GGML_ASSERT(hparams.n_expert_used == 0);
2475
+ }
2202
2476
 
2203
2477
  // n_head_kv is optional, default to n_head
2204
2478
  hparams.n_head_kv = hparams.n_head;
2205
- GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
2479
+ ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
2206
2480
 
2207
- hparams.rope_finetuned = false;
2208
- GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
2209
- kv(LLM_KV_ROPE_SCALING_FINETUNED));
2481
+ bool rope_finetuned = false;
2482
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2483
+ hparams.rope_finetuned = rope_finetuned;
2210
2484
 
2211
2485
  hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
2212
- GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
2213
- kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
2486
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
2214
2487
 
2215
2488
  // rope_freq_base (optional)
2216
2489
  hparams.rope_freq_base_train = 10000.0f;
2217
- GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
2490
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
2218
2491
 
2219
2492
  std::string rope_scaling("linear");
2220
- GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
2493
+ ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2221
2494
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2222
2495
  GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
2223
2496
 
2224
2497
  // rope_freq_scale (inverse of the kv) is optional
2225
2498
  float ropescale = 0.0f;
2226
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
2227
- if (ropescale == 0.0f) { // try the old key name
2228
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
2499
+ if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
2500
+ // try the old key name
2501
+ ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
2229
2502
  }
2230
2503
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
2231
2504
 
@@ -2233,7 +2506,7 @@ static void llm_load_hparams(
2233
2506
  {
2234
2507
  hparams.n_rot = hparams.n_embd / hparams.n_head;
2235
2508
 
2236
- GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
2509
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
2237
2510
 
2238
2511
  if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
2239
2512
  if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2248,7 +2521,7 @@ static void llm_load_hparams(
2248
2521
  switch (model.arch) {
2249
2522
  case LLM_ARCH_LLAMA:
2250
2523
  {
2251
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2524
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2252
2525
 
2253
2526
  switch (hparams.n_layer) {
2254
2527
  case 26: model.type = e_model::MODEL_3B; break;
@@ -2262,7 +2535,7 @@ static void llm_load_hparams(
2262
2535
  } break;
2263
2536
  case LLM_ARCH_FALCON:
2264
2537
  {
2265
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2538
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2266
2539
 
2267
2540
  switch (hparams.n_layer) {
2268
2541
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2272,7 +2545,7 @@ static void llm_load_hparams(
2272
2545
  } break;
2273
2546
  case LLM_ARCH_BAICHUAN:
2274
2547
  {
2275
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2548
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2276
2549
  switch (hparams.n_layer) {
2277
2550
  case 32: model.type = e_model::MODEL_7B; break;
2278
2551
  case 40: model.type = e_model::MODEL_13B; break;
@@ -2281,7 +2554,7 @@ static void llm_load_hparams(
2281
2554
  } break;
2282
2555
  case LLM_ARCH_STARCODER:
2283
2556
  {
2284
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2557
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2285
2558
  switch (hparams.n_layer) {
2286
2559
  case 24: model.type = e_model::MODEL_1B; break;
2287
2560
  case 36: model.type = e_model::MODEL_3B; break;
@@ -2292,7 +2565,7 @@ static void llm_load_hparams(
2292
2565
  } break;
2293
2566
  case LLM_ARCH_PERSIMMON:
2294
2567
  {
2295
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2568
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2296
2569
  switch (hparams.n_layer) {
2297
2570
  case 36: model.type = e_model::MODEL_8B; break;
2298
2571
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2300,7 +2573,7 @@ static void llm_load_hparams(
2300
2573
  } break;
2301
2574
  case LLM_ARCH_REFACT:
2302
2575
  {
2303
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2576
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2304
2577
  switch (hparams.n_layer) {
2305
2578
  case 32: model.type = e_model::MODEL_1B; break;
2306
2579
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2308,7 +2581,7 @@ static void llm_load_hparams(
2308
2581
  } break;
2309
2582
  case LLM_ARCH_BLOOM:
2310
2583
  {
2311
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2584
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2312
2585
 
2313
2586
  switch (hparams.n_layer) {
2314
2587
  case 24: model.type = e_model::MODEL_1B; break;
@@ -2323,9 +2596,9 @@ static void llm_load_hparams(
2323
2596
  {
2324
2597
  hparams.f_clamp_kqv = 0.0f;
2325
2598
 
2326
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2327
- GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2328
- GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2599
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2600
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
2601
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
2329
2602
 
2330
2603
  switch (hparams.n_layer) {
2331
2604
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2335,13 +2608,23 @@ static void llm_load_hparams(
2335
2608
  } break;
2336
2609
  case LLM_ARCH_STABLELM:
2337
2610
  {
2338
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2611
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2339
2612
 
2340
2613
  switch (hparams.n_layer) {
2341
2614
  case 32: model.type = e_model::MODEL_3B; break;
2342
2615
  default: model.type = e_model::MODEL_UNKNOWN;
2343
2616
  }
2344
2617
  } break;
2618
+ case LLM_ARCH_QWEN:
2619
+ {
2620
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2621
+
2622
+ switch (hparams.n_layer) {
2623
+ case 32: model.type = e_model::MODEL_7B; break;
2624
+ case 40: model.type = e_model::MODEL_13B; break;
2625
+ default: model.type = e_model::MODEL_UNKNOWN;
2626
+ }
2627
+ } break;
2345
2628
 
2346
2629
  default: (void)0;
2347
2630
  }
@@ -2383,7 +2666,7 @@ static void llm_load_vocab(
  {
  std::string tokenizer_name;
 
- GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
  if (tokenizer_name == "llama") {
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2473,34 +2756,31 @@ static void llm_load_vocab(
2473
2756
  };
2474
2757
  for (const auto & it : special_token_types) {
2475
2758
  const std::string & key = kv(std::get<0>(it));
2476
- int32_t & id = std::get<1>(it), old_id = id;
2759
+ int32_t & id = std::get<1>(it);
2477
2760
 
2478
- GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
2479
- // Must be >= -1 and < vocab size. Since the key is unsigned, -1
2480
- // can only come from the default value, so there's no point in
2481
- // validating that.
2482
- if (size_t(id + 1) > vocab.id_to_token.size()) {
2483
- LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
2484
- __func__, key.c_str(), id, old_id);
2485
- id = old_id;
2761
+ uint32_t new_id;
2762
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
2763
+ continue;
2764
+ }
2765
+ if (new_id >= vocab.id_to_token.size()) {
2766
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
2767
+ __func__, key.c_str(), new_id, id);
2768
+ } else {
2769
+ id = new_id;
2486
2770
  }
2487
2771
 
2488
2772
  }
2489
2773
 
2490
2774
  // Handle add_bos_token and add_eos_token
2491
- std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
2492
- int kid = gguf_find_key(ctx, key.c_str());
2493
- enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2494
- vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2495
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2496
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2497
- }
2498
- key = kv(LLM_KV_TOKENIZER_ADD_EOS);
2499
- kid = gguf_find_key(ctx, key.c_str());
2500
- ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2501
- vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2502
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2503
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2775
+ {
2776
+ bool temp = true;
2777
+
2778
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2779
+ vocab.special_add_bos = int(temp);
2780
+ }
2781
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2782
+ vocab.special_add_eos = int(temp);
2783
+ }
2504
2784
  }
2505
2785
  }
2506
2786
 
@@ -2511,7 +2791,7 @@ static void llm_load_vocab(
  // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
  // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
  // are special tokens.
- // From testing, this appears to corelate 1:1 with special tokens.
+ // From testing, this appears to correlate 1:1 with special tokens.
  //
 
  // Counting special tokens and verifying in only one direction
@@ -2624,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2733,14 +3015,7 @@ static void llm_load_tensors(
2733
3015
  ggml_backend_type backend_output;
2734
3016
 
2735
3017
  if (n_gpu_layers > int(n_layer)) {
2736
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2737
- // on Windows however this is detrimental unless everything is on the GPU
2738
- #ifndef _WIN32
2739
- backend_norm = llama_backend_offload;
2740
- #else
2741
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2742
- #endif // _WIN32
2743
-
3018
+ backend_norm = llama_backend_offload;
2744
3019
  backend_output = llama_backend_offload_split;
2745
3020
  } else {
2746
3021
  backend_norm = GGML_BACKEND_CPU;
@@ -2777,17 +3052,55 @@ static void llm_load_tensors(
2777
3052
  layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2778
3053
  layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2779
3054
 
3055
+ // optional bias tensors
3056
+ layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
3057
+ layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
3058
+ layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
3059
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
3060
+
2780
3061
  layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2781
3062
 
2782
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
2783
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2784
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3063
+ layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
3064
+
3065
+ if (layer.ffn_gate_inp == nullptr) {
3066
+ GGML_ASSERT(hparams.n_expert == 0);
3067
+ GGML_ASSERT(hparams.n_expert_used == 0);
3068
+
3069
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3070
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3071
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3072
+ } else {
3073
+ GGML_ASSERT(hparams.n_expert > 0);
3074
+ GGML_ASSERT(hparams.n_expert_used > 0);
3075
+
3076
+ // MoE branch
3077
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
3078
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3079
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
3080
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3081
+ }
3082
+ }
2785
3083
 
2786
3084
  if (backend == GGML_BACKEND_GPU) {
2787
3085
  vram_weights +=
2788
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2789
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2790
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3086
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3087
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
3088
+ (layer.bq ? ggml_nbytes(layer.bq) : 0) +
3089
+ (layer.bk ? ggml_nbytes(layer.bk) : 0) +
3090
+ (layer.bv ? ggml_nbytes(layer.bv) : 0) +
3091
+ (layer.bo ? ggml_nbytes(layer.bo) : 0) +
3092
+ ggml_nbytes(layer.ffn_norm);
3093
+
3094
+ if (layer.ffn_gate_inp == nullptr) {
3095
+ vram_weights +=
3096
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3097
+ } else {
3098
+ vram_weights += ggml_nbytes(layer.ffn_gate_inp);
3099
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
3100
+ vram_weights +=
3101
+ ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
3102
+ }
3103
+ }
2791
3104
  }
2792
3105
  }
2793
3106
  } break;
@@ -2799,14 +3112,7 @@ static void llm_load_tensors(
2799
3112
  ggml_backend_type backend_output;
2800
3113
 
2801
3114
  if (n_gpu_layers > int(n_layer)) {
2802
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2803
- // on Windows however this is detrimental unless everything is on the GPU
2804
- #ifndef _WIN32
2805
- backend_norm = llama_backend_offload;
2806
- #else
2807
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2808
- #endif // _WIN32
2809
-
3115
+ backend_norm = llama_backend_offload;
2810
3116
  backend_output = llama_backend_offload_split;
2811
3117
  } else {
2812
3118
  backend_norm = GGML_BACKEND_CPU;
@@ -2869,14 +3175,7 @@ static void llm_load_tensors(
2869
3175
  ggml_backend_type backend_output;
2870
3176
 
2871
3177
  if (n_gpu_layers > int(n_layer)) {
2872
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2873
- // on Windows however this is detrimental unless everything is on the GPU
2874
- #ifndef _WIN32
2875
- backend_norm = llama_backend_offload;
2876
- #else
2877
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2878
- #endif // _WIN32
2879
-
3178
+ backend_norm = llama_backend_offload;
2880
3179
  backend_output = llama_backend_offload_split;
2881
3180
  } else {
2882
3181
  backend_norm = GGML_BACKEND_CPU;
@@ -2946,14 +3245,7 @@ static void llm_load_tensors(
2946
3245
  ggml_backend_type backend_output;
2947
3246
 
2948
3247
  if (n_gpu_layers > int(n_layer)) {
2949
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2950
- // on Windows however this is detrimental unless everything is on the GPU
2951
- #ifndef _WIN32
2952
- backend_norm = llama_backend_offload;
2953
- #else
2954
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2955
- #endif // _WIN32
2956
-
3248
+ backend_norm = llama_backend_offload;
2957
3249
  backend_output = llama_backend_offload_split;
2958
3250
  } else {
2959
3251
  backend_norm = GGML_BACKEND_CPU;
@@ -3023,21 +3315,7 @@ static void llm_load_tensors(
3023
3315
  ggml_backend_type backend_output;
3024
3316
 
3025
3317
  if (n_gpu_layers > int(n_layer)) {
3026
- #ifdef GGML_USE_CUBLAS
3027
- if (n_gpu_layers > int(n_layer + 1)) {
3028
- LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
3029
- __func__, n_layer + 1);
3030
- throw std::runtime_error("Persimmon CUDA offload failed");
3031
- }
3032
- #endif
3033
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3034
- // on Windows however this is detrimental unless everything is on the GPU
3035
- #ifndef _WIN32
3036
- backend_norm = llama_backend_offload;
3037
- #else
3038
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3039
- #endif // _WIN32
3040
-
3318
+ backend_norm = llama_backend_offload;
3041
3319
  backend_output = llama_backend_offload_split;
3042
3320
  } else {
3043
3321
  backend_norm = GGML_BACKEND_CPU;
@@ -3096,14 +3374,7 @@ static void llm_load_tensors(
3096
3374
  ggml_backend_type backend_output;
3097
3375
 
3098
3376
  if (n_gpu_layers > int(n_layer)) {
3099
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3100
- // on Windows however this is detrimental unless everything is on the GPU
3101
- #ifndef _WIN32
3102
- backend_norm = llama_backend_offload;
3103
- #else
3104
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3105
- #endif // _WIN32
3106
-
3377
+ backend_norm = llama_backend_offload;
3107
3378
  backend_output = llama_backend_offload_split;
3108
3379
  } else {
3109
3380
  backend_norm = GGML_BACKEND_CPU;
@@ -3174,14 +3445,7 @@ static void llm_load_tensors(
3174
3445
  ggml_backend_type backend_output;
3175
3446
 
3176
3447
  if (n_gpu_layers > int(n_layer)) {
3177
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3178
- // on Windows however this is detrimental unless everything is on the GPU
3179
- #ifndef _WIN32
3180
- backend_norm = llama_backend_offload;
3181
- #else
3182
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3183
- #endif // _WIN32
3184
-
3448
+ backend_norm = llama_backend_offload;
3185
3449
  backend_output = llama_backend_offload_split;
3186
3450
  } else {
3187
3451
  backend_norm = GGML_BACKEND_CPU;
@@ -3241,14 +3505,7 @@ static void llm_load_tensors(
3241
3505
  ggml_backend_type backend_output;
3242
3506
 
3243
3507
  if (n_gpu_layers > int(n_layer)) {
3244
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3245
- // on Windows however this is detrimental unless everything is on the GPU
3246
- #ifndef _WIN32
3247
- backend_norm = llama_backend_offload;
3248
- #else
3249
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3250
- #endif // _WIN32
3251
-
3508
+ backend_norm = llama_backend_offload;
3252
3509
  backend_output = llama_backend_offload_split;
3253
3510
  } else {
3254
3511
  backend_norm = GGML_BACKEND_CPU;
@@ -3305,6 +3562,64 @@ static void llm_load_tensors(
3305
3562
  }
3306
3563
  }
3307
3564
  } break;
3565
+ case LLM_ARCH_QWEN:
3566
+ {
3567
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3568
+ {
3569
+ ggml_backend_type backend_norm;
3570
+ ggml_backend_type backend_output;
3571
+
3572
+ if (n_gpu_layers > int(n_layer)) {
3573
+ backend_norm = llama_backend_offload;
3574
+ backend_output = llama_backend_offload_split;
3575
+ } else {
3576
+ backend_norm = GGML_BACKEND_CPU;
3577
+ backend_output = GGML_BACKEND_CPU;
3578
+ }
3579
+
3580
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3581
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3582
+
3583
+ if (backend_norm == GGML_BACKEND_GPU) {
3584
+ vram_weights += ggml_nbytes(model.output_norm);
3585
+ }
3586
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3587
+ vram_weights += ggml_nbytes(model.output);
3588
+ }
3589
+ }
3590
+
3591
+ const uint32_t n_ff = hparams.n_ff / 2;
3592
+
3593
+ const int i_gpu_start = n_layer - n_gpu_layers;
3594
+
3595
+ model.layers.resize(n_layer);
3596
+
3597
+ for (uint32_t i = 0; i < n_layer; ++i) {
3598
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3599
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3600
+
3601
+ auto & layer = model.layers[i];
3602
+
3603
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3604
+
3605
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
3606
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
3607
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3608
+
3609
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3610
+
3611
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3612
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3613
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3614
+
3615
+ if (backend == GGML_BACKEND_GPU) {
3616
+ vram_weights +=
3617
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3618
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3619
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3620
+ }
3621
+ }
3622
+ } break;
3308
3623
 
3309
3624
  default:
3310
3625
  throw std::runtime_error("unknown architecture");
@@ -3331,8 +3646,8 @@ static void llm_load_tensors(
  }
 
  #ifdef GGML_USE_CUBLAS
- const int max_backend_supported_layers = hparams.n_layer + 3;
- const int max_offloadable_layers = hparams.n_layer + 3;
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
  #elif GGML_USE_CLBLAST
  const int max_backend_supported_layers = hparams.n_layer + 1;
  const int max_offloadable_layers = hparams.n_layer + 1;
@@ -3373,7 +3688,7 @@ static void llm_load_tensors(
 
  static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
  try {
- llama_model_loader ml(fname, params.use_mmap);
+ llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
  model.hparams.vocab_only = params.vocab_only;
 
 
@@ -3500,11 +3815,11 @@ static void llm_build_k_shift(
3500
3815
  struct ggml_tensor * tmp =
3501
3816
  // we rotate only the first n_rot dimensions
3502
3817
  ggml_rope_custom_inplace(ctx,
3503
- ggml_view_3d(ctx, kv.k,
3818
+ ggml_view_3d(ctx, kv.k_l[il],
3504
3819
  n_embd_head, n_head_kv, n_ctx,
3505
- ggml_element_size(kv.k)*n_embd_head,
3506
- ggml_element_size(kv.k)*n_embd_gqa,
3507
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
3820
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
3821
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
3822
+ 0),
3508
3823
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
3509
3824
  ext_factor, attn_factor, beta_fast, beta_slow);
3510
3825
  cb(tmp, "K_shifted", il);
@@ -3531,13 +3846,13 @@ static void llm_build_kv_store(
3531
3846
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
3532
3847
  cb(v_cur_t, "v_cur_t", il);
3533
3848
 
3534
- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa,
3535
- (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3849
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
3850
+ (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
3536
3851
  cb(k_cache_view, "k_cache_view", il);
3537
3852
 
3538
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa,
3539
- ( n_ctx)*ggml_element_size(kv.v),
3540
- (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v));
3853
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
3854
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
3855
+ (kv_head)*ggml_element_size(kv.v_l[il]));
3541
3856
  cb(v_cache_view, "v_cache_view", il);
3542
3857
 
3543
3858
  // important: storing RoPE-ed version of K in the KV cache!
@@ -3689,11 +4004,11 @@ static struct ggml_tensor * llm_build_kqv(
3689
4004
  cb(q, "q", il);
3690
4005
 
3691
4006
  struct ggml_tensor * k =
3692
- ggml_view_3d(ctx, kv.k,
4007
+ ggml_view_3d(ctx, kv.k_l[il],
3693
4008
  n_embd_head, n_kv, n_head_kv,
3694
- ggml_element_size(kv.k)*n_embd_gqa,
3695
- ggml_element_size(kv.k)*n_embd_head,
3696
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il);
4009
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
4010
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
4011
+ 0);
3697
4012
  cb(k, "k", il);
3698
4013
 
3699
4014
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -3724,11 +4039,11 @@ static struct ggml_tensor * llm_build_kqv(
3724
4039
 
3725
4040
  // split cached v into n_head heads
3726
4041
  struct ggml_tensor * v =
3727
- ggml_view_3d(ctx, kv.v,
4042
+ ggml_view_3d(ctx, kv.v_l[il],
3728
4043
  n_kv, n_embd_head, n_head_kv,
3729
- ggml_element_size(kv.v)*n_ctx,
3730
- ggml_element_size(kv.v)*n_ctx*n_embd_head,
3731
- ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il);
4044
+ ggml_element_size(kv.v_l[il])*n_ctx,
4045
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
4046
+ 0);
3732
4047
  cb(v, "v", il);
3733
4048
 
3734
4049
  struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
@@ -3766,6 +4081,8 @@ struct llm_build_context {
3766
4081
  const int64_t n_head_kv;
3767
4082
  const int64_t n_embd_head;
3768
4083
  const int64_t n_embd_gqa;
4084
+ const int64_t n_expert;
4085
+ const int64_t n_expert_used;
3769
4086
 
3770
4087
  const float freq_base;
3771
4088
  const float freq_scale;
@@ -3807,6 +4124,8 @@ struct llm_build_context {
3807
4124
  n_head_kv (hparams.n_head_kv),
3808
4125
  n_embd_head (hparams.n_embd_head()),
3809
4126
  n_embd_gqa (hparams.n_embd_gqa()),
4127
+ n_expert (hparams.n_expert),
4128
+ n_expert_used (hparams.n_expert_used),
3810
4129
  freq_base (cparams.rope_freq_base),
3811
4130
  freq_scale (cparams.rope_freq_scale),
3812
4131
  ext_factor (cparams.yarn_ext_factor),
@@ -3886,12 +4205,24 @@ struct llm_build_context {
3886
4205
  // compute Q and K and RoPE them
3887
4206
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3888
4207
  cb(Qcur, "Qcur", il);
4208
+ if (model.layers[il].bq) {
4209
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
4210
+ cb(Qcur, "Qcur", il);
4211
+ }
3889
4212
 
3890
4213
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3891
4214
  cb(Kcur, "Kcur", il);
4215
+ if (model.layers[il].bk) {
4216
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
4217
+ cb(Kcur, "Kcur", il);
4218
+ }
3892
4219
 
3893
4220
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3894
4221
  cb(Vcur, "Vcur", il);
4222
+ if (model.layers[il].bv) {
4223
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
4224
+ cb(Vcur, "Vcur", il);
4225
+ }
3895
4226
 
3896
4227
  Qcur = ggml_rope_custom(
3897
4228
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3910,7 +4241,7 @@ struct llm_build_context {
3910
4241
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3911
4242
 
3912
4243
  cur = llm_build_kqv(ctx0, hparams, kv_self,
3913
- model.layers[il].wo, NULL,
4244
+ model.layers[il].wo, model.layers[il].bo,
3914
4245
  Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
3915
4246
  cb(cur, "kqv_out", il);
3916
4247
  }
@@ -3919,7 +4250,7 @@ struct llm_build_context {
3919
4250
  cb(ffn_inp, "ffn_inp", il);
3920
4251
 
3921
4252
  // feed-forward network
3922
- {
4253
+ if (model.layers[il].ffn_gate_inp == nullptr) {
3923
4254
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
3924
4255
  model.layers[il].ffn_norm, NULL,
3925
4256
  LLM_NORM_RMS, cb, il);
@@ -3931,6 +4262,69 @@ struct llm_build_context {
3931
4262
  model.layers[il].ffn_down, NULL,
3932
4263
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
3933
4264
  cb(cur, "ffn_out", il);
4265
+ } else {
4266
+ // MoE branch
4267
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
4268
+ model.layers[il].ffn_norm, NULL,
4269
+ LLM_NORM_RMS, cb, il);
4270
+ cb(cur, "ffn_norm", il);
4271
+
4272
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
4273
+ cb(logits, "ffn_moe_logits", il);
4274
+
4275
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
4276
+ cb(probs, "ffn_moe_probs", il);
4277
+
4278
+ // select experts
4279
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
4280
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
4281
+
4282
+ ggml_tensor * weights = ggml_get_rows(ctx0,
4283
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
4284
+ cb(weights, "ffn_moe_weights", il);
4285
+
4286
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
4287
+
4288
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
4289
+ cb(weights_sum, "ffn_moe_weights_sum", il);
4290
+
4291
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
4292
+ cb(weights, "ffn_moe_weights_norm", il);
4293
+
4294
+ // compute expert outputs
4295
+ ggml_tensor * moe_out = nullptr;
4296
+
4297
+ for (int i = 0; i < n_expert_used; ++i) {
4298
+ ggml_tensor * cur_expert;
4299
+
4300
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
4301
+ cb(cur_up, "ffn_moe_up", il);
4302
+
4303
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
4304
+ cb(cur_gate, "ffn_moe_gate", il);
4305
+
4306
+ cur_gate = ggml_silu(ctx0, cur_gate);
4307
+ cb(cur_gate, "ffn_moe_silu", il);
4308
+
4309
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
4310
+ cb(cur_expert, "ffn_moe_gate_par", il);
4311
+
4312
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
4313
+ cb(cur_expert, "ffn_moe_down", il);
4314
+
4315
+ cur_expert = ggml_mul(ctx0, cur_expert,
4316
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
4317
+ cb(cur_expert, "ffn_moe_weighted", il);
4318
+
4319
+ if (i == 0) {
4320
+ moe_out = cur_expert;
4321
+ } else {
4322
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
4323
+ cb(moe_out, "ffn_moe_out", il);
4324
+ }
4325
+ }
4326
+
4327
+ cur = moe_out;
3934
4328
  }
3935
4329
 
3936
4330
  cur = ggml_add(ctx0, cur, ffn_inp);
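Editor's note: the MoE branch added above routes each token through only n_expert_used of the n_expert feed-forward experts: softmax the gate logits, keep the top-k probabilities, renormalize them to sum to 1, then add up the selected experts' outputs weighted by those probabilities. Below is a minimal single-token sketch of that routing in plain C++; the expert functions are made up and merely stand in for the silu(gate)*up -> down projections built with ggml_mul_mat_id in the graph:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const int n_expert = 8, n_expert_used = 2;
        // gate logits for one token (hypothetical values)
        std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.5f, 1.7f, -0.3f, 0.0f, 0.9f};

        // softmax -> probs (ggml_soft_max in the graph)
        float mx = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(n_expert);
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (float & p : probs) p /= sum;

        // top-k expert ids by probability (ggml_top_k / ggml_get_rows in the graph)
        std::vector<int> ids(n_expert);
        std::iota(ids.begin(), ids.end(), 0);
        std::partial_sort(ids.begin(), ids.begin() + n_expert_used, ids.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected weights (ggml_sum_rows + ggml_div in the graph)
        float wsum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) wsum += probs[ids[i]];

        // weighted sum of the selected experts' outputs
        float x = 1.0f, out = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            float expert_out = (ids[i] + 1) * x;           // hypothetical expert
            out += (probs[ids[i]] / wsum) * expert_out;
        }
        printf("selected experts %d,%d -> out %.3f\n", ids[0], ids[1], out);
        return 0;
    }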
@@ -4308,6 +4702,7 @@ struct llm_build_context {
4308
4702
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4309
4703
  cb(inpL, "imp_embd", -1);
4310
4704
 
4705
+ // inp_pos - contains the positions
4311
4706
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4312
4707
  cb(inp_pos, "inp_pos", -1);
4313
4708
 
@@ -4315,6 +4710,7 @@ struct llm_build_context {
4315
4710
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4316
4711
  cb(KQ_scale, "KQ_scale", -1);
4317
4712
 
4713
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4318
4714
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4319
4715
  cb(KQ_mask, "KQ_mask", -1);
4320
4716
 
@@ -4903,6 +5299,121 @@ struct llm_build_context {
4903
5299
 
4904
5300
  return gf;
4905
5301
  }
5302
+
5303
+ struct ggml_cgraph * build_qwen() {
5304
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5305
+
5306
+ struct ggml_tensor * cur;
5307
+ struct ggml_tensor * inpL;
5308
+
5309
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5310
+ cb(inpL, "inp_embd", -1);
5311
+
5312
+ // inp_pos - contains the positions
5313
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5314
+ cb(inp_pos, "inp_pos", -1);
5315
+
5316
+ // KQ_scale
5317
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5318
+ cb(KQ_scale, "KQ_scale", -1);
5319
+
5320
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5321
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5322
+ cb(KQ_mask, "KQ_mask", -1);
5323
+
5324
+ // shift the entire K-cache if needed
5325
+ if (do_rope_shift) {
5326
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5327
+ }
5328
+
5329
+ for (int il = 0; il < n_layer; ++il) {
5330
+ struct ggml_tensor * inpSA = inpL;
5331
+
5332
+ cur = llm_build_norm(ctx0, inpL, hparams,
5333
+ model.layers[il].attn_norm, NULL,
5334
+ LLM_NORM_RMS, cb, il);
5335
+ cb(cur, "attn_norm", il);
5336
+
5337
+ // self-attention
5338
+ {
5339
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5340
+ cb(cur, "wqkv", il);
5341
+
5342
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5343
+ cb(cur, "bqkv", il);
5344
+
5345
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5346
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5347
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
5348
+
5349
+ cb(Qcur, "Qcur", il);
5350
+ cb(Kcur, "Kcur", il);
5351
+ cb(Vcur, "Vcur", il);
5352
+
5353
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5354
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5355
+
5356
+ // using mode = 2 for neox mode
5357
+ Qcur = ggml_rope_custom(
5358
+ ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5359
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5360
+ );
5361
+ cb(Qcur, "Qcur", il);
5362
+
5363
+ Kcur = ggml_rope_custom(
5364
+ ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5365
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5366
+ );
5367
+ cb(Kcur, "Kcur", il);
5368
+
5369
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5370
+
5371
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
5372
+ model.layers[il].wo, NULL,
5373
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5374
+ cb(cur, "kqv_out", il);
5375
+ }
5376
+
5377
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5378
+ cb(ffn_inp, "ffn_inp", il);
5379
+
5380
+ // feed-forward network
5381
+ {
5382
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
5383
+ model.layers[il].ffn_norm, NULL,
5384
+ LLM_NORM_RMS, cb, il);
5385
+ cb(cur, "ffn_norm", il);
5386
+
5387
+ cur = llm_build_ffn(ctx0, cur,
5388
+ model.layers[il].ffn_up, NULL,
5389
+ model.layers[il].ffn_gate, NULL,
5390
+ model.layers[il].ffn_down, NULL,
5391
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5392
+ cb(cur, "ffn_out", il);
5393
+ }
5394
+
5395
+ cur = ggml_add(ctx0, cur, ffn_inp);
5396
+ cb(cur, "l_out", il);
5397
+
5398
+ // input for next layer
5399
+ inpL = cur;
5400
+ }
5401
+
5402
+ cur = inpL;
5403
+
5404
+ cur = llm_build_norm(ctx0, cur, hparams,
5405
+ model.output_norm, NULL,
5406
+ LLM_NORM_RMS, cb, -1);
5407
+ cb(cur, "result_norm", -1);
5408
+
5409
+ // lm_head
5410
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5411
+ cb(cur, "result_output", -1);
5412
+
5413
+ ggml_build_forward_expand(gf, cur);
5414
+
5415
+ return gf;
5416
+ }
4906
5417
  };
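Editor's note: build_qwen() runs one fused wqkv projection and then slices Q, K and V back out with three 2-D views at per-column float offsets 0, n_embd and 2*n_embd. A small sketch of that slicing on a flat row-major buffer, using plain C++ pointers rather than ggml views; the sizes are made up:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd = 4, n_tokens = 2;
        // fused projection output: one column of 3*n_embd floats per token,
        // laid out [Q | K | V] like the wqkv result in the graph above
        std::vector<float> qkv(3 * n_embd * n_tokens);
        for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

        for (int t = 0; t < n_tokens; ++t) {
            const float * col = qkv.data() + (size_t) t * 3 * n_embd;
            const float * Q = col + 0 * n_embd;   // view at offset 0
            const float * K = col + 1 * n_embd;   // view at offset n_embd
            const float * V = col + 2 * n_embd;   // view at offset 2*n_embd
            printf("token %d: Q[0]=%g K[0]=%g V[0]=%g\n", t, Q[0], K[0], V[0]);
        }
        return 0;
    }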
4907
5418
 
4908
5419
  //
@@ -4913,8 +5424,8 @@ struct llm_build_context {
4913
5424
  enum llm_offload_func_e {
4914
5425
  OFFLOAD_FUNC_NOP,
4915
5426
  OFFLOAD_FUNC,
4916
- OFFLOAD_FUNC_KQ,
4917
- OFFLOAD_FUNC_V,
5427
+ OFFLOAD_FUNC_FRC, // force offload
5428
+ OFFLOAD_FUNC_KQV,
4918
5429
  OFFLOAD_FUNC_NR,
4919
5430
  OFFLOAD_FUNC_EMB,
4920
5431
  OFFLOAD_FUNC_OUT,
@@ -5000,11 +5511,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5000
5511
  //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
5001
5512
  { "pos_embd", OFFLOAD_FUNC_NR },
5002
5513
 
5003
- { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope)
5004
- { "KQ_scale", OFFLOAD_FUNC_KQ },
5005
- { "KQ_mask", OFFLOAD_FUNC_KQ },
5006
- { "K_shift", OFFLOAD_FUNC_KQ },
5007
- { "K_shifted", OFFLOAD_FUNC_KQ },
5514
+ { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
5515
+ { "KQ_scale", OFFLOAD_FUNC_FRC },
5516
+ { "KQ_mask", OFFLOAD_FUNC_FRC },
5517
+ { "K_shift", OFFLOAD_FUNC_FRC },
5518
+
5519
+ { "K_shifted", OFFLOAD_FUNC },
5008
5520
 
5009
5521
  { "inp_norm", OFFLOAD_FUNC_NR },
5010
5522
  { "inp_norm_w", OFFLOAD_FUNC_NR },
@@ -5017,38 +5529,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5017
5529
  { "attn_norm", OFFLOAD_FUNC },
5018
5530
  { "attn_norm_2", OFFLOAD_FUNC },
5019
5531
 
5020
- { "wqkv", OFFLOAD_FUNC_KQ },
5021
- { "bqkv", OFFLOAD_FUNC_KQ },
5022
- { "wqkv_clamped", OFFLOAD_FUNC_KQ },
5023
-
5024
- { "tmpk", OFFLOAD_FUNC_KQ },
5025
- { "tmpq", OFFLOAD_FUNC_KQ },
5026
- { "tmpv", OFFLOAD_FUNC_V },
5027
- { "Kcur", OFFLOAD_FUNC_KQ },
5028
- { "Qcur", OFFLOAD_FUNC_KQ },
5029
- { "Vcur", OFFLOAD_FUNC_V },
5030
-
5031
- { "krot", OFFLOAD_FUNC_KQ },
5032
- { "qrot", OFFLOAD_FUNC_KQ },
5033
- { "kpass", OFFLOAD_FUNC_KQ },
5034
- { "qpass", OFFLOAD_FUNC_KQ },
5035
- { "krotated", OFFLOAD_FUNC_KQ },
5036
- { "qrotated", OFFLOAD_FUNC_KQ },
5037
-
5038
- { "q", OFFLOAD_FUNC_KQ },
5039
- { "k", OFFLOAD_FUNC_KQ },
5040
- { "kq", OFFLOAD_FUNC_KQ },
5041
- { "kq_scaled", OFFLOAD_FUNC_KQ },
5042
- { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
5043
- { "kq_masked", OFFLOAD_FUNC_KQ },
5044
- { "kq_soft_max", OFFLOAD_FUNC_V },
5045
- { "kq_soft_max_ext", OFFLOAD_FUNC_V },
5046
- { "v", OFFLOAD_FUNC_V },
5047
- { "kqv", OFFLOAD_FUNC_V },
5048
- { "kqv_merged", OFFLOAD_FUNC_V },
5049
- { "kqv_merged_cont", OFFLOAD_FUNC_V },
5050
- { "kqv_wo", OFFLOAD_FUNC_V },
5051
- { "kqv_out", OFFLOAD_FUNC_V },
5532
+ { "wqkv", OFFLOAD_FUNC_KQV },
5533
+ { "bqkv", OFFLOAD_FUNC_KQV },
5534
+ { "wqkv_clamped", OFFLOAD_FUNC_KQV },
5535
+
5536
+ { "tmpk", OFFLOAD_FUNC_KQV },
5537
+ { "tmpq", OFFLOAD_FUNC_KQV },
5538
+ { "tmpv", OFFLOAD_FUNC_KQV },
5539
+ { "Kcur", OFFLOAD_FUNC_KQV },
5540
+ { "Qcur", OFFLOAD_FUNC_KQV },
5541
+ { "Vcur", OFFLOAD_FUNC_KQV },
5542
+
5543
+ { "krot", OFFLOAD_FUNC_KQV },
5544
+ { "qrot", OFFLOAD_FUNC_KQV },
5545
+ { "kpass", OFFLOAD_FUNC_KQV },
5546
+ { "qpass", OFFLOAD_FUNC_KQV },
5547
+ { "krotated", OFFLOAD_FUNC_KQV },
5548
+ { "qrotated", OFFLOAD_FUNC_KQV },
5549
+
5550
+ { "q", OFFLOAD_FUNC_KQV },
5551
+ { "k", OFFLOAD_FUNC_KQV },
5552
+ { "kq", OFFLOAD_FUNC_KQV },
5553
+ { "kq_scaled", OFFLOAD_FUNC_KQV },
5554
+ { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
5555
+ { "kq_masked", OFFLOAD_FUNC_KQV },
5556
+ { "kq_soft_max", OFFLOAD_FUNC_KQV },
5557
+ { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
5558
+ { "v", OFFLOAD_FUNC_KQV },
5559
+ { "kqv", OFFLOAD_FUNC_KQV },
5560
+ { "kqv_merged", OFFLOAD_FUNC_KQV },
5561
+ { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
5562
+ { "kqv_wo", OFFLOAD_FUNC_KQV },
5563
+ { "kqv_out", OFFLOAD_FUNC_KQV },
5052
5564
 
5053
5565
  { "ffn_inp", OFFLOAD_FUNC },
5054
5566
  { "ffn_norm", OFFLOAD_FUNC },
@@ -5067,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5067
5579
  { "ffn_relu", OFFLOAD_FUNC },
5068
5580
  { "ffn_sqr(relu)", OFFLOAD_FUNC },
5069
5581
 
5582
+ { "ffn_moe_logits", OFFLOAD_FUNC },
5583
+ { "ffn_moe_probs", OFFLOAD_FUNC },
5584
+ { "ffn_moe_argsort", OFFLOAD_FUNC },
5585
+ { "ffn_moe_weights", OFFLOAD_FUNC },
5586
+ { "ffn_moe_weights_sum", OFFLOAD_FUNC },
5587
+ { "ffn_moe_weights_norm", OFFLOAD_FUNC },
5588
+ { "ffn_moe_weighted", OFFLOAD_FUNC },
5589
+ { "ffn_moe_up", OFFLOAD_FUNC },
5590
+ { "ffn_moe_gate", OFFLOAD_FUNC },
5591
+ { "ffn_moe_silu", OFFLOAD_FUNC },
5592
+ { "ffn_moe_gate_par", OFFLOAD_FUNC },
5593
+ { "ffn_moe_down", OFFLOAD_FUNC },
5594
+ { "ffn_moe_out", OFFLOAD_FUNC },
5595
+
5070
5596
  { "l_out", OFFLOAD_FUNC },
5071
5597
 
5072
5598
  { "result_norm", OFFLOAD_FUNC_EMB },
@@ -5240,15 +5766,15 @@ static struct ggml_cgraph * llama_build_graph(
5240
5766
  { OFFLOAD_FUNC_NOP, "CPU" },
5241
5767
  { OFFLOAD_FUNC_OUT, "CPU" },
5242
5768
  #ifdef GGML_USE_CUBLAS
5243
- { OFFLOAD_FUNC, "GPU (CUDA)" },
5244
- { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" },
5245
- { OFFLOAD_FUNC_V, "GPU (CUDA) V" },
5246
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
5769
+ { OFFLOAD_FUNC, "GPU (CUDA)" },
5770
+ { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
5771
+ { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
5772
+ { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
5247
5773
  { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
5248
5774
  #else
5249
5775
  { OFFLOAD_FUNC, "CPU" },
5250
- { OFFLOAD_FUNC_KQ, "CPU" },
5251
- { OFFLOAD_FUNC_V, "CPU" },
5776
+ { OFFLOAD_FUNC_FRC, "CPU" },
5777
+ { OFFLOAD_FUNC_KQV, "CPU" },
5252
5778
  { OFFLOAD_FUNC_NR, "CPU" },
5253
5779
  { OFFLOAD_FUNC_EMB, "CPU" },
5254
5780
  #endif // GGML_USE_CUBLAS
@@ -5281,18 +5807,23 @@ static struct ggml_cgraph * llama_build_graph(
5281
5807
  }
5282
5808
  }
5283
5809
  break;
5284
- case OFFLOAD_FUNC_NR:
5285
- if (n_gpu_layers <= n_layer + 0) {
5810
+ case OFFLOAD_FUNC_FRC:
5811
+ if (!lctx.cparams.offload_kqv) {
5286
5812
  func_e = OFFLOAD_FUNC_NOP;
5287
- }
5288
- break;
5289
- case OFFLOAD_FUNC_V:
5290
- if (n_gpu_layers <= n_layer + 1) {
5813
+ } break;
5814
+ case OFFLOAD_FUNC_KQV:
5815
+ if (!lctx.cparams.offload_kqv) {
5291
5816
  func_e = OFFLOAD_FUNC_NOP;
5817
+ } else {
5818
+ if (n_gpu_layers < n_layer) {
5819
+ if (il < i_gpu_start) {
5820
+ func_e = OFFLOAD_FUNC_NOP;
5821
+ }
5822
+ }
5292
5823
  }
5293
5824
  break;
5294
- case OFFLOAD_FUNC_KQ:
5295
- if (n_gpu_layers <= n_layer + 2) {
5825
+ case OFFLOAD_FUNC_NR:
5826
+ if (n_gpu_layers <= n_layer + 0) {
5296
5827
  func_e = OFFLOAD_FUNC_NOP;
5297
5828
  }
5298
5829
  break;
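Editor's note: the reworked dispatch collapses the old KQ/V buckets into two. OFFLOAD_FUNC_FRC tensors are offloaded whenever offload_kqv is enabled, while OFFLOAD_FUNC_KQV tensors additionally respect the per-layer GPU split and stay on the CPU for layers below the first offloaded layer when the model is only partially offloaded. A condensed sketch of that decision; i_gpu_start = n_layer - n_gpu_layers is an assumption about how the first offloaded layer is derived, not something taken from this hunk:

    #include <cstdio>

    enum offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, OFFLOAD_FUNC_FRC, OFFLOAD_FUNC_KQV };

    // Mirrors the switch in llama_build_graph: returns the effective bucket for one tensor.
    static offload_func_e resolve(offload_func_e f, bool offload_kqv,
                                  int n_gpu_layers, int n_layer, int i_gpu_start, int il) {
        switch (f) {
            case OFFLOAD_FUNC_FRC:
                return offload_kqv ? f : OFFLOAD_FUNC_NOP;
            case OFFLOAD_FUNC_KQV:
                if (!offload_kqv) return OFFLOAD_FUNC_NOP;
                if (n_gpu_layers < n_layer && il < i_gpu_start) return OFFLOAD_FUNC_NOP;
                return f;
            default:
                return f;
        }
    }

    int main() {
        // 32-layer model, 20 layers on the GPU, KQV offload enabled (hypothetical numbers)
        const int n_layer = 32, n_gpu_layers = 20, i_gpu_start = n_layer - n_gpu_layers;
        printf("layer  3, KQV -> %d (stays on CPU)\n",
               resolve(OFFLOAD_FUNC_KQV, true, n_gpu_layers, n_layer, i_gpu_start, 3));
        printf("layer 20, KQV -> %d (offloaded)\n",
               resolve(OFFLOAD_FUNC_KQV, true, n_gpu_layers, n_layer, i_gpu_start, 20));
        printf("any layer, FRC -> %d (offloaded)\n",
               resolve(OFFLOAD_FUNC_FRC, true, n_gpu_layers, n_layer, i_gpu_start, 0));
        return 0;
    }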
@@ -5317,8 +5848,8 @@ static struct ggml_cgraph * llama_build_graph(
5317
5848
  case OFFLOAD_FUNC_NOP:
5318
5849
  case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
5319
5850
  case OFFLOAD_FUNC:
5320
- case OFFLOAD_FUNC_KQ:
5321
- case OFFLOAD_FUNC_V:
5851
+ case OFFLOAD_FUNC_KQV:
5852
+ case OFFLOAD_FUNC_FRC:
5322
5853
  case OFFLOAD_FUNC_NR:
5323
5854
  case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
5324
5855
  default: GGML_ASSERT(false);
@@ -5377,6 +5908,10 @@ static struct ggml_cgraph * llama_build_graph(
5377
5908
  {
5378
5909
  result = llm.build_stablelm();
5379
5910
  } break;
5911
+ case LLM_ARCH_QWEN:
5912
+ {
5913
+ result = llm.build_qwen();
5914
+ } break;
5380
5915
  default:
5381
5916
  GGML_ASSERT(false);
5382
5917
  }
@@ -5454,7 +5989,7 @@ static int llama_decode_internal(
5454
5989
  const int64_t n_embd = hparams.n_embd;
5455
5990
  const int64_t n_vocab = hparams.n_vocab;
5456
5991
 
5457
- // helpers for smoother batch API transistion
5992
+ // helpers for smoother batch API transition
5458
5993
  // after deprecating the llama_eval calls, these will be removed
5459
5994
  std::vector<llama_pos> pos;
5460
5995
 
@@ -5499,8 +6034,8 @@ static int llama_decode_internal(
5499
6034
  // a heuristic, to avoid attending the full cache if it is not yet utilized
5500
6035
  // after enough generations, the benefit from this heuristic disappears
5501
6036
  // if we start defragmenting the cache, the benefit from this will be more important
5502
- //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
5503
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
6037
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
6038
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
5504
6039
 
5505
6040
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
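Editor's note: the new heuristic keeps kv_self.n rounded up to a multiple of 32 (at least 32, at most n_ctx) instead of using the exact cell maximum, which is friendlier to the CUDA kernels. A tiny sketch of the same clamp with GGML_PAD written out explicitly; the cell_max values are made up:

    #include <algorithm>
    #include <cstdio>

    // GGML_PAD(x, n) rounds x up to the next multiple of n (n is a power of two here).
    static int pad(int x, int n) { return (x + n - 1) & ~(n - 1); }

    int main() {
        const int n_ctx = 4096;
        const int samples[] = {0, 1, 33, 400, 5000};                   // hypothetical cache usage
        for (int cell_max : samples) {
            int n = std::min(n_ctx, std::max(32, pad(cell_max, 32)));  // same clamp as the diff
            printf("cell_max=%4d -> kv_self.n=%4d\n", cell_max, n);
        }
        return 0;
    }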
5506
6041
 
@@ -5551,7 +6086,7 @@ static int llama_decode_internal(
5551
6086
  n_threads = std::min(4, n_threads);
5552
6087
  }
5553
6088
 
5554
- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
6089
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
5555
6090
  if (ggml_cpu_has_cublas() && fully_offloaded) {
5556
6091
  n_threads = 1;
5557
6092
  }
@@ -6233,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
6233
6768
 
6234
6769
  // loop over the text
6235
6770
  while (true) {
6236
- // find the first occurence of a given special token in this fragment
6771
+ // find the first occurrence of a given special token in this fragment
6237
6772
  // passing offset argument only limit the "search area" but match coordinates
6238
6773
  // are still relative to the source full raw_text
6239
6774
  auto match = raw_text->find(special_token, raw_text_base_offset);
6240
6775
 
6241
- // no occurences found, stop processing this fragment for a given special token
6776
+ // no occurrences found, stop processing this fragment for a given special token
6242
6777
  if (match == std::string::npos) break;
6243
6778
 
6244
6779
  // check if match is within bounds of offset <-> length
@@ -6410,14 +6945,13 @@ struct llama_grammar_candidate {
6410
6945
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
6411
6946
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
6412
6947
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6413
- const char * src,
6414
- size_t n_src,
6948
+ const std::string & src,
6415
6949
  llama_partial_utf8 partial_start) {
6416
6950
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
6417
- const char * pos = src;
6951
+ const char * pos = src.c_str();
6418
6952
  std::vector<uint32_t> code_points;
6419
6953
  // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
6420
- code_points.reserve(n_src + 1);
6954
+ code_points.reserve(src.size() + 1);
6421
6955
  uint32_t value = partial_start.value;
6422
6956
  int n_remain = partial_start.n_remain;
6423
6957
 
@@ -6468,13 +7002,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6468
7002
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
6469
7003
  }
6470
7004
 
6471
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6472
- std::string src,
6473
- llama_partial_utf8 partial_start
6474
- ) {
6475
- return decode_utf8(src.c_str(), src.size(), partial_start);
6476
- }
6477
-
6478
7005
  // returns true iff pos points to the end of one of the definitions of a rule
6479
7006
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
6480
7007
  switch (pos->type) {
@@ -7113,7 +7640,9 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
7113
7640
  const llama_token eos = llama_token_eos(&ctx->model);
7114
7641
 
7115
7642
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
7643
+ candidates_decoded.reserve(candidates->size);
7116
7644
  std::vector<llama_grammar_candidate> candidates_grammar;
7645
+ candidates_grammar.reserve(candidates->size);
7117
7646
 
7118
7647
  for (size_t i = 0; i < candidates->size; ++i) {
7119
7648
  const llama_token id = candidates->data[i].id;
@@ -7443,7 +7972,7 @@ struct llama_beam_search_data {
7443
7972
  }
7444
7973
 
7445
7974
  // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
7446
- // The repetative patterns below reflect the 2 stages of heaps:
7975
+ // The repetitive patterns below reflect the 2 stages of heaps:
7447
7976
  // * Gather elements until the vector is full, then call std::make_heap() on it.
7448
7977
  // * If the heap is full and a new element is found that should be included, pop the
7449
7978
  // least element to the back(), replace it with the new, then push it into the heap.
@@ -7650,18 +8179,21 @@ static void llama_convert_tensor_internal(
7650
8179
  return;
7651
8180
  }
7652
8181
 
7653
- auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
7654
- auto block_size_bytes = ggml_type_size(tensor->type);
8182
+ size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
8183
+ size_t block_size_bytes = ggml_type_size(tensor->type);
7655
8184
 
7656
8185
  GGML_ASSERT(nelements % block_size == 0);
7657
- auto nblocks = nelements / block_size;
7658
- auto blocks_per_thread = nblocks / nthread;
7659
- auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
8186
+ size_t nblocks = nelements / block_size;
8187
+ size_t blocks_per_thread = nblocks / nthread;
8188
+ size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
8189
+
8190
+ size_t in_buff_offs = 0;
8191
+ size_t out_buff_offs = 0;
7660
8192
 
7661
- for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
7662
- auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
7663
- auto thr_elems = thr_blocks * block_size; // number of elements for this thread
7664
- auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
8193
+ for (int tnum = 0; tnum < nthread; tnum++) {
8194
+ size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
8195
+ size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
8196
+ size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
7665
8197
 
7666
8198
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
7667
8199
  if (typ == GGML_TYPE_F16) {
@@ -7678,11 +8210,9 @@ static void llama_convert_tensor_internal(
7678
8210
  workers.clear();
7679
8211
  }
7680
8212
 
7681
- static ggml_type get_k_quant_type(
7682
- quantize_state_internal & qs,
7683
- ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
7684
- ) {
8213
+ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
7685
8214
  const std::string name = ggml_get_name(tensor);
8215
+
7686
8216
  // TODO: avoid hardcoded tensor names - use the TN_* constants
7687
8217
  const llm_arch arch = qs.model.arch;
7688
8218
  const auto tn = LLM_TN(arch);
@@ -7716,7 +8246,18 @@ static ggml_type get_k_quant_type(
7716
8246
  // nearly negligible increase in model size by quantizing this tensor with more bits:
7717
8247
  if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
7718
8248
  }
8249
+ if (qs.model.hparams.n_expert == 8) {
8250
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
8251
+ // TODO: explore better strategies
8252
+ new_type = GGML_TYPE_Q8_0;
8253
+ }
7719
8254
  ++qs.i_attention_wv;
8255
+ } else if (name.find("attn_k.weight") != std::string::npos) {
8256
+ if (qs.model.hparams.n_expert == 8) {
8257
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
8258
+ // TODO: explore better strategies
8259
+ new_type = GGML_TYPE_Q8_0;
8260
+ }
7720
8261
  } else if (name.find("ffn_down.weight") != std::string::npos) {
7721
8262
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
7722
8263
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -7831,7 +8372,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7831
8372
  constexpr bool use_mmap = false;
7832
8373
  #endif
7833
8374
 
7834
- llama_model_loader ml(fname_inp, use_mmap);
8375
+ llama_model_loader ml(fname_inp, use_mmap, NULL);
7835
8376
  if (ml.use_mmap) {
7836
8377
  ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
7837
8378
  }
@@ -7925,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7925
8466
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
7926
8467
 
7927
8468
  // quantize only 2D tensors
7928
- quantize &= (tensor->n_dims == 2);
8469
+ quantize &= (ggml_n_dims(tensor) == 2);
7929
8470
  quantize &= params->quantize_output_tensor || name != "output.weight";
7930
8471
  quantize &= !params->only_copy;
7931
8472
 
8473
+ // do not quantize expert gating tensors
8474
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
8475
+
7932
8476
  enum ggml_type new_type;
7933
8477
  void * new_data;
7934
8478
  size_t new_size;
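Editor's note: taken together, the quantization hunks in this release gate tensors by name and shape: only 2-D tensors whose name ends in "weight" are quantized, the expert-router ffn_gate_inp.weight is always left untouched, and on 8-expert models the attn_k/attn_v weights are promoted to Q8_0. A small filter sketching those rules in plain C++; the type enum is a placeholder rather than ggml's, and the output.weight / only_copy conditions from the diff are omitted for brevity:

    #include <cstdio>
    #include <string>

    enum fake_type { TYPE_Q4_K, TYPE_Q8_0 };   // placeholders for ggml_type values

    static bool should_quantize(const std::string & name, int n_dims) {
        bool q = name.size() >= 6 && name.rfind("weight") == name.size() - 6; // ends with "weight"
        q = q && n_dims == 2;                                                 // only 2-D tensors
        q = q && name.find("ffn_gate_inp.weight") == std::string::npos;       // keep expert router intact
        return q;
    }

    static fake_type pick_type(const std::string & name, fake_type base, int n_expert) {
        // the 8-expert case bumps K/V attention weights to 8-bit
        if (n_expert == 8 && (name.find("attn_k.weight") != std::string::npos ||
                              name.find("attn_v.weight") != std::string::npos)) {
            return TYPE_Q8_0;
        }
        return base;
    }

    int main() {
        const char * names[] = { "blk.0.attn_v.weight", "blk.0.ffn_gate_inp.weight", "output.weight" };
        for (const char * n : names) {
            printf("%-28s quantize=%d type=%d\n", n, should_quantize(n, 2), pick_type(n, TYPE_Q4_K, 8));
        }
        return 0;
    }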
@@ -8127,7 +8671,7 @@ static int llama_apply_lora_from_file_internal(
8127
8671
  std::vector<uint8_t> base_buf;
8128
8672
  if (path_base_model) {
8129
8673
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
8130
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
8674
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
8131
8675
 
8132
8676
  size_t ctx_size;
8133
8677
  size_t mmapped_size;
@@ -8355,6 +8899,7 @@ struct llama_model_params llama_model_default_params() {
8355
8899
  /*.tensor_split =*/ nullptr,
8356
8900
  /*.progress_callback =*/ nullptr,
8357
8901
  /*.progress_callback_user_data =*/ nullptr,
8902
+ /*.kv_overrides =*/ nullptr,
8358
8903
  /*.vocab_only =*/ false,
8359
8904
  /*.use_mmap =*/ true,
8360
8905
  /*.use_mlock =*/ false,
@@ -8382,10 +8927,12 @@ struct llama_context_params llama_context_default_params() {
8382
8927
  /*.yarn_beta_fast =*/ 32.0f,
8383
8928
  /*.yarn_beta_slow =*/ 1.0f,
8384
8929
  /*.yarn_orig_ctx =*/ 0,
8930
+ /*.type_k =*/ GGML_TYPE_F16,
8931
+ /*.type_v =*/ GGML_TYPE_F16,
8385
8932
  /*.mul_mat_q =*/ true,
8386
- /*.f16_kv =*/ true,
8387
8933
  /*.logits_all =*/ false,
8388
8934
  /*.embedding =*/ false,
8935
+ /*.offload_kqv =*/ true,
8389
8936
  };
8390
8937
 
8391
8938
  return result;
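Editor's note: with the old f16_kv flag gone, callers now choose the K and V cache types directly and can toggle offload_kqv. A minimal usage sketch against the llama.h C API of this version; the model path is a placeholder and error handling is omitted:

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                        // offload everything that fits

        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx       = 4096;
        cparams.type_k      = GGML_TYPE_F16;              // replaces the old f16_kv flag
        cparams.type_v      = GGML_TYPE_F16;
        cparams.offload_kqv = true;                       // keep KV cache + attention on the GPU

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... decode calls would go here ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }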
@@ -8502,6 +9049,7 @@ struct llama_context * llama_new_context_with_model(
8502
9049
  cparams.yarn_beta_fast = params.yarn_beta_fast;
8503
9050
  cparams.yarn_beta_slow = params.yarn_beta_slow;
8504
9051
  cparams.mul_mat_q = params.mul_mat_q;
9052
+ cparams.offload_kqv = params.offload_kqv;
8505
9053
 
8506
9054
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
8507
9055
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8535,19 +9083,36 @@ struct llama_context * llama_new_context_with_model(
8535
9083
  ctx->rng = std::mt19937(params.seed);
8536
9084
  ctx->logits_all = params.logits_all;
8537
9085
 
8538
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
9086
+ const ggml_type type_k = params.type_k;
9087
+ const ggml_type type_v = params.type_v;
9088
+
9089
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
9090
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
8539
9091
 
8540
9092
  // reserve memory for context buffers
8541
9093
  if (!hparams.vocab_only) {
8542
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
9094
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
8543
9095
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
8544
9096
  llama_free(ctx);
8545
9097
  return nullptr;
8546
9098
  }
8547
9099
 
8548
9100
  {
8549
- const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
8550
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
9101
+ size_t memory_size_k = 0;
9102
+ size_t memory_size_v = 0;
9103
+
9104
+ for (auto & k : ctx->kv_self.k_l) {
9105
+ memory_size_k += ggml_nbytes(k);
9106
+ }
9107
+
9108
+ for (auto & v : ctx->kv_self.v_l) {
9109
+ memory_size_v += ggml_nbytes(v);
9110
+ }
9111
+
9112
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
9113
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
9114
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
9115
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
8551
9116
  }
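Editor's note: the log line now reports K and V separately because the two caches can use different, possibly quantized, types. A back-of-the-envelope calculator for the expected totals, assuming 2 bytes per f16 element and 34 bytes per 32-element q8_0 block (GGML's block layout); the model dimensions are hypothetical 7B-class numbers:

    #include <cstdio>

    // bytes for one cache tensor of [n_embd_gqa x n_ctx] rows per layer, summed over layers
    static double cache_mib(double bytes_per_row, int n_ctx, int n_layer) {
        return bytes_per_row * n_ctx * n_layer / (1024.0 * 1024.0);
    }

    int main() {
        const int n_layer = 32, n_ctx = 4096, n_embd_gqa = 4096;     // hypothetical
        const double f16_row  = 2.0 * n_embd_gqa;                    // 2 bytes per element
        const double q8_0_row = 34.0 * (n_embd_gqa / 32.0);          // 34 bytes per 32-element block

        printf("K+V, both f16 : %7.2f MiB\n", 2 * cache_mib(f16_row, n_ctx, n_layer));
        printf("K q8_0, V f16 : %7.2f MiB\n", cache_mib(q8_0_row, n_ctx, n_layer) +
                                              cache_mib(f16_row,  n_ctx, n_layer));
        return 0;
    }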
8552
9117
 
8553
9118
  // resized during inference
@@ -8618,8 +9183,12 @@ struct llama_context * llama_new_context_with_model(
8618
9183
  }
8619
9184
 
8620
9185
  size_t kv_vram_size = 0;
8621
- add_tensor(ctx->kv_self.k, kv_vram_size);
8622
- add_tensor(ctx->kv_self.v, kv_vram_size);
9186
+ for (auto & k : ctx->kv_self.k_l) {
9187
+ add_tensor(k, kv_vram_size);
9188
+ }
9189
+ for (auto & v : ctx->kv_self.v_l) {
9190
+ add_tensor(v, kv_vram_size);
9191
+ }
8623
9192
 
8624
9193
  size_t ctx_vram_size = alloc_size + kv_vram_size;
8625
9194
  size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9089,37 +9658,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
9089
9658
  data_ctx->write(&kv_used, sizeof(kv_used));
9090
9659
 
9091
9660
  if (kv_buf_size) {
9092
- const size_t elt_size = ggml_element_size(kv_self.k);
9661
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
9093
9662
 
9094
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9663
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9095
9664
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
9096
9665
 
9097
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
9098
- std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
9099
- kout3d->data = kout3d_data.data();
9666
+ std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
9667
+ std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
9100
9668
 
9101
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
9102
- std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
9103
- vout3d->data = vout3d_data.data();
9669
+ for (int il = 0; il < (int) n_layer; ++il) {
9670
+ ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9671
+ kout2d_data[il].resize(ggml_nbytes(kout2d));
9672
+ kout2d->data = kout2d_data[il].data();
9104
9673
 
9105
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
9106
- n_embd, kv_head, n_layer,
9107
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
9674
+ ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9675
+ vout2d_data[il].resize(ggml_nbytes(vout2d));
9676
+ vout2d->data = vout2d_data[il].data();
9108
9677
 
9109
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
9110
- kv_head, n_embd, n_layer,
9111
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
9678
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9679
+ n_embd, kv_head,
9680
+ elt_size*n_embd, 0);
9681
+
9682
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9683
+ kv_head, n_embd,
9684
+ elt_size*n_ctx, 0);
9685
+
9686
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
9687
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
9688
+ }
9112
9689
 
9113
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
9114
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
9115
9690
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
9116
9691
 
9117
9692
  ggml_free(cpy_ctx);
9118
9693
 
9119
- // our data is now in the kout3d_data and vout3d_data buffers
9694
+ // our data is now in the kout2d_data and vout2d_data buffers
9120
9695
  // write them to file
9121
- data_ctx->write(kout3d_data.data(), kout3d_data.size());
9122
- data_ctx->write(vout3d_data.data(), vout3d_data.size());
9696
+ for (uint32_t il = 0; il < n_layer; ++il) {
9697
+ data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
9698
+ data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
9699
+ }
9123
9700
  }
9124
9701
 
9125
9702
  for (uint32_t i = 0; i < kv_size; ++i) {
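Editor's note: session save now walks the layers and copies each layer's K block (n_embd x kv_head elements) followed by its V block, instead of one 3-D blob, and the load hunk below reads them back in the same order, so the serialized layout is simply K0, V0, K1, V1, ... A small sketch that round-trips such a layout through a flat byte buffer with plain memcpy, using tiny made-up sizes and floats standing in for the cache element type:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        const int n_layer = 2, n_embd = 4, kv_head = 3;       // hypothetical, tiny
        const size_t block = (size_t) n_embd * kv_head;       // elements per K (and per V) block

        // fake per-layer caches (would be kv_self.k_l[il] / v_l[il] contents)
        std::vector<std::vector<float>> k(n_layer, std::vector<float>(block));
        std::vector<std::vector<float>> v(n_layer, std::vector<float>(block));
        for (int il = 0; il < n_layer; ++il)
            for (size_t i = 0; i < block; ++i) { k[il][i] = il + 0.1f * i; v[il][i] = -k[il][i]; }

        // "write": K0, V0, K1, V1, ... back to back
        std::vector<uint8_t> state;
        for (int il = 0; il < n_layer; ++il) {
            const uint8_t * kp = (const uint8_t *) k[il].data();
            const uint8_t * vp = (const uint8_t *) v[il].data();
            state.insert(state.end(), kp, kp + block * sizeof(float));
            state.insert(state.end(), vp, vp + block * sizeof(float));
        }

        // "read" it back in the same order
        std::vector<std::vector<float>> k2(n_layer, std::vector<float>(block));
        std::vector<std::vector<float>> v2(n_layer, std::vector<float>(block));
        const uint8_t * inp = state.data();
        for (int il = 0; il < n_layer; ++il) {
            std::memcpy(k2[il].data(), inp, block * sizeof(float)); inp += block * sizeof(float);
            std::memcpy(v2[il].data(), inp, block * sizeof(float)); inp += block * sizeof(float);
        }

        printf("round trip ok: %d\n", k2[1][5] == k[1][5] && v2[0][2] == v[0][2]);
        return 0;
    }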
@@ -9219,29 +9796,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
9219
9796
  if (kv_buf_size) {
9220
9797
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
9221
9798
 
9222
- const size_t elt_size = ggml_element_size(kv_self.k);
9799
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
9223
9800
 
9224
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9801
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9225
9802
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
9226
9803
 
9227
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
9228
- kin3d->data = (void *) inp;
9229
- inp += ggml_nbytes(kin3d);
9804
+ for (int il = 0; il < n_layer; ++il) {
9805
+ ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9806
+ kin2d->data = (void *) inp;
9807
+ inp += ggml_nbytes(kin2d);
9808
+
9809
+ ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9810
+ vin2d->data = (void *) inp;
9811
+ inp += ggml_nbytes(vin2d);
9230
9812
 
9231
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
9232
- vin3d->data = (void *) inp;
9233
- inp += ggml_nbytes(vin3d);
9813
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9814
+ n_embd, kv_head,
9815
+ elt_size*n_embd, 0);
9234
9816
 
9235
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
9236
- n_embd, kv_head, n_layer,
9237
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
9817
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9818
+ kv_head, n_embd,
9819
+ elt_size*n_ctx, 0);
9238
9820
 
9239
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
9240
- kv_head, n_embd, n_layer,
9241
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
9821
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
9822
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
9823
+ }
9242
9824
 
9243
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
9244
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
9245
9825
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
9246
9826
 
9247
9827
  ggml_free(cpy_ctx);