llama_cpp 0.9.5 → 0.10.1

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -74,6 +74,7 @@
74
74
  #include <set>
75
75
  #include <sstream>
76
76
  #include <thread>
77
+ #include <type_traits>
77
78
  #include <unordered_map>
78
79
 
79
80
  #if defined(_MSC_VER)
@@ -90,7 +91,8 @@
90
91
  #define LLAMA_ATTRIBUTE_FORMAT(...)
91
92
  #endif
92
93
 
93
- #define LLAMA_MAX_NODES 8192
94
+ #define LLAMA_MAX_NODES 8192
95
+ #define LLAMA_MAX_EXPERTS 8
94
96
 
95
97
  //
96
98
  // logging
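The new LLAMA_MAX_EXPERTS constant caps the number of mixture-of-experts (MoE) experts per layer. Later hunks in this diff store the per-expert feed-forward tensors in fixed-size arrays and assert the limit when loading hyperparameters, roughly:

    // per-expert tensor arrays in llama_layer (see the llama_layer hunk further down)
    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
    struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];

    // enforced in llm_load_hparams (see below)
    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);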
@@ -192,6 +194,7 @@ enum llm_arch {
192
194
  LLM_ARCH_REFACT,
193
195
  LLM_ARCH_BLOOM,
194
196
  LLM_ARCH_STABLELM,
197
+ LLM_ARCH_QWEN,
195
198
  LLM_ARCH_UNKNOWN,
196
199
  };
197
200
 
@@ -208,6 +211,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
208
211
  { LLM_ARCH_REFACT, "refact" },
209
212
  { LLM_ARCH_BLOOM, "bloom" },
210
213
  { LLM_ARCH_STABLELM, "stablelm" },
214
+ { LLM_ARCH_QWEN, "qwen" },
211
215
  };
212
216
 
213
217
  enum llm_kv {
@@ -228,6 +232,8 @@ enum llm_kv {
228
232
  LLM_KV_FEED_FORWARD_LENGTH,
229
233
  LLM_KV_USE_PARALLEL_RESIDUAL,
230
234
  LLM_KV_TENSOR_DATA_LAYOUT,
235
+ LLM_KV_EXPERT_COUNT,
236
+ LLM_KV_EXPERT_USED_COUNT,
231
237
 
232
238
  LLM_KV_ATTENTION_HEAD_COUNT,
233
239
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -278,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
278
284
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
279
285
  { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
280
286
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
287
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
288
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
281
289
 
282
290
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
283
291
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
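The "%s" in these key templates is replaced with the architecture name, so for a llama-architecture model the new expert metadata lives under keys like the following (a sketch using the LLM_KV helper defined earlier in this file):

    const auto kv = LLM_KV(LLM_ARCH_LLAMA);
    kv(LLM_KV_EXPERT_COUNT);       // -> "llama.expert_count"
    kv(LLM_KV_EXPERT_USED_COUNT);  // -> "llama.expert_used_count"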
@@ -335,10 +343,14 @@ enum llm_tensor {
335
343
  LLM_TENSOR_ATTN_NORM,
336
344
  LLM_TENSOR_ATTN_NORM_2,
337
345
  LLM_TENSOR_ATTN_ROT_EMBD,
346
+ LLM_TENSOR_FFN_GATE_INP,
347
+ LLM_TENSOR_FFN_NORM,
338
348
  LLM_TENSOR_FFN_GATE,
339
349
  LLM_TENSOR_FFN_DOWN,
340
350
  LLM_TENSOR_FFN_UP,
341
- LLM_TENSOR_FFN_NORM,
351
+ LLM_TENSOR_FFN_DOWN_EXP,
352
+ LLM_TENSOR_FFN_GATE_EXP,
353
+ LLM_TENSOR_FFN_UP_EXP,
342
354
  LLM_TENSOR_ATTN_Q_NORM,
343
355
  LLM_TENSOR_ATTN_K_NORM,
344
356
  };
@@ -357,10 +369,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
357
369
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
358
370
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
359
371
  { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
372
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
360
373
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
361
374
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
362
375
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
363
376
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
377
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
378
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
379
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
364
380
  },
365
381
  },
366
382
  {
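The new *_EXP tensor-name templates take two indices: the block (layer) number and the expert number. Together with the extra LLM_TN::operator() overload added further down, a per-expert tensor name expands as in this sketch (indices illustrative):

    const auto tn = LLM_TN(LLM_ARCH_LLAMA);
    tn(LLM_TENSOR_FFN_GATE_EXP, "weight", /*bid =*/ 2, /*xid =*/ 3);  // -> "blk.2.ffn_gate.3.weight"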
@@ -518,6 +534,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
518
534
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
519
535
  },
520
536
  },
537
+ {
538
+ LLM_ARCH_QWEN,
539
+ {
540
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
541
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
542
+ { LLM_TENSOR_OUTPUT, "output" },
543
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
544
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
545
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
546
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
547
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
548
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
549
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
550
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
551
+ },
552
+ },
521
553
 
522
554
  {
523
555
  LLM_ARCH_UNKNOWN,
@@ -566,27 +598,16 @@ struct LLM_TN {
566
598
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
567
599
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
568
600
  }
601
+
602
+ std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
603
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
604
+ }
569
605
  };
570
606
 
571
607
  //
572
608
  // gguf helpers
573
609
  //
574
610
 
575
- #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
576
- do { \
577
- const std::string skey(key); \
578
- const int kid = gguf_find_key(ctx, skey.c_str()); \
579
- if (kid >= 0) { \
580
- enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
581
- if (ktype != (type)) { \
582
- throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
583
- } \
584
- (dst) = func(ctx, kid); \
585
- } else if (req) { \
586
- throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
587
- } \
588
- } while (0)
589
-
590
611
  static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
591
612
  { LLAMA_ROPE_SCALING_NONE, "none" },
592
613
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -620,7 +641,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
620
641
  }
621
642
  }
622
643
 
623
- static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
644
+ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
624
645
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
625
646
 
626
647
  switch (type) {
@@ -1155,6 +1176,8 @@ struct llama_hparams {
1155
1176
  uint32_t n_layer;
1156
1177
  uint32_t n_rot;
1157
1178
  uint32_t n_ff;
1179
+ uint32_t n_expert = 0;
1180
+ uint32_t n_expert_used = 0;
1158
1181
 
1159
1182
  float f_norm_eps;
1160
1183
  float f_norm_rms_eps;
@@ -1169,15 +1192,18 @@ struct llama_hparams {
1169
1192
  float f_max_alibi_bias;
1170
1193
 
1171
1194
  bool operator!=(const llama_hparams & other) const {
1172
- if (this->vocab_only != other.vocab_only) return true;
1173
- if (this->n_vocab != other.n_vocab) return true;
1174
- if (this->n_ctx_train != other.n_ctx_train) return true;
1175
- if (this->n_embd != other.n_embd) return true;
1176
- if (this->n_head != other.n_head) return true;
1177
- if (this->n_head_kv != other.n_head_kv) return true;
1178
- if (this->n_layer != other.n_layer) return true;
1179
- if (this->n_rot != other.n_rot) return true;
1180
- if (this->n_ff != other.n_ff) return true;
1195
+ if (this->vocab_only != other.vocab_only) return true;
1196
+ if (this->n_vocab != other.n_vocab) return true;
1197
+ if (this->n_ctx_train != other.n_ctx_train) return true;
1198
+ if (this->n_embd != other.n_embd) return true;
1199
+ if (this->n_head != other.n_head) return true;
1200
+ if (this->n_head_kv != other.n_head_kv) return true;
1201
+ if (this->n_layer != other.n_layer) return true;
1202
+ if (this->n_rot != other.n_rot) return true;
1203
+ if (this->n_ff != other.n_ff) return true;
1204
+ if (this->n_expert != other.n_expert) return true;
1205
+ if (this->n_expert_used != other.n_expert_used) return true;
1206
+
1181
1207
  if (this->rope_finetuned != other.rope_finetuned) return true;
1182
1208
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
1183
1209
 
@@ -1222,6 +1248,7 @@ struct llama_cparams {
1222
1248
  float yarn_beta_slow;
1223
1249
 
1224
1250
  bool mul_mat_q;
1251
+ bool offload_kqv;
1225
1252
  };
1226
1253
 
1227
1254
  struct llama_layer {
@@ -1243,6 +1270,9 @@ struct llama_layer {
1243
1270
  struct ggml_tensor * wqkv;
1244
1271
 
1245
1272
  // attention bias
1273
+ struct ggml_tensor * bq;
1274
+ struct ggml_tensor * bk;
1275
+ struct ggml_tensor * bv;
1246
1276
  struct ggml_tensor * bo;
1247
1277
  struct ggml_tensor * bqkv;
1248
1278
 
@@ -1255,6 +1285,12 @@ struct llama_layer {
1255
1285
  struct ggml_tensor * ffn_down; // w2
1256
1286
  struct ggml_tensor * ffn_up; // w3
1257
1287
 
1288
+ // ff MoE
1289
+ struct ggml_tensor * ffn_gate_inp;
1290
+ struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
1291
+ struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
1292
+ struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
1293
+
1258
1294
  // ff bias
1259
1295
  struct ggml_tensor * ffn_down_b; // b2
1260
1296
  struct ggml_tensor * ffn_up_b; // b3
@@ -1287,8 +1323,8 @@ struct llama_kv_cache {
1287
1323
 
1288
1324
  std::vector<llama_kv_cell> cells;
1289
1325
 
1290
- struct ggml_tensor * k = NULL;
1291
- struct ggml_tensor * v = NULL;
1326
+ std::vector<struct ggml_tensor *> k_l; // per layer
1327
+ std::vector<struct ggml_tensor *> v_l;
1292
1328
 
1293
1329
  struct ggml_context * ctx = NULL;
1294
1330
 
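Replacing the two monolithic cache tensors with per-layer vectors gives every layer its own K and V tensor, so each layer can be kept on the CPU or offloaded to the GPU independently, and views into the cache no longer need a per-layer offset. A minimal sketch of the indexing change, using the names from this diff:

    // before: one tensor, layer il addressed by an offset of il*n_ctx*n_embd_gqa rows into kv.k
    // after:  one tensor per layer; only the head position offsets the view
    struct ggml_tensor * k_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
            ggml_row_size(kv.k_l[il]->type, n_embd_gqa)*kv_head);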
@@ -1301,8 +1337,10 @@ struct llama_kv_cache {
1301
1337
 
1302
1338
  #ifdef GGML_USE_CUBLAS
1303
1339
  if (ggml_cublas_loaded()) {
1304
- ggml_cuda_free_data(k);
1305
- ggml_cuda_free_data(v);
1340
+ for (size_t i = 0; i < k_l.size(); ++i) {
1341
+ ggml_cuda_free_data(k_l[i]);
1342
+ ggml_cuda_free_data(v_l[i]);
1343
+ }
1306
1344
  }
1307
1345
  #endif
1308
1346
  }
@@ -1492,9 +1530,11 @@ struct llama_context {
1492
1530
  static bool llama_kv_cache_init(
1493
1531
  const struct llama_hparams & hparams,
1494
1532
  struct llama_kv_cache & cache,
1495
- ggml_type wtype,
1533
+ ggml_type ktype,
1534
+ ggml_type vtype,
1496
1535
  uint32_t n_ctx,
1497
- int n_gpu_layers) {
1536
+ int n_gpu_layers,
1537
+ bool offload) {
1498
1538
  const uint32_t n_embd = hparams.n_embd_gqa();
1499
1539
  const uint32_t n_layer = hparams.n_layer;
1500
1540
 
@@ -1510,7 +1550,7 @@ static bool llama_kv_cache_init(
1510
1550
  cache.cells.clear();
1511
1551
  cache.cells.resize(n_ctx);
1512
1552
 
1513
- cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1553
+ cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
1514
1554
  memset(cache.buf.data, 0, cache.buf.size);
1515
1555
 
1516
1556
  struct ggml_init_params params;
@@ -1520,37 +1560,44 @@ static bool llama_kv_cache_init(
1520
1560
 
1521
1561
  cache.ctx = ggml_init(params);
1522
1562
 
1563
+ size_t vram_kv_cache = 0;
1564
+
1523
1565
  if (!cache.ctx) {
1524
1566
  LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
1525
1567
  return false;
1526
1568
  }
1527
1569
 
1528
- cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
1529
- cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
1530
- ggml_set_name(cache.k, "cache_k");
1531
- ggml_set_name(cache.v, "cache_v");
1570
+ cache.k_l.reserve(n_layer);
1571
+ cache.v_l.reserve(n_layer);
1532
1572
 
1533
- (void) n_gpu_layers;
1573
+ const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
1534
1574
 
1535
- #ifdef GGML_USE_CUBLAS
1536
- if (ggml_cublas_loaded()) {
1537
- size_t vram_kv_cache = 0;
1575
+ GGML_UNUSED(offload);
1538
1576
 
1539
- if (n_gpu_layers > (int)n_layer + 1) {
1540
- ggml_cuda_assign_buffers_no_scratch(cache.v);
1541
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1542
- vram_kv_cache += ggml_nbytes(cache.v);
1543
- }
1544
- if (n_gpu_layers > (int)n_layer + 2) {
1545
- ggml_cuda_assign_buffers_no_scratch(cache.k);
1546
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1547
- vram_kv_cache += ggml_nbytes(cache.k);
1548
- }
1549
- if (vram_kv_cache > 0) {
1550
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1577
+ for (int i = 0; i < (int) n_layer; i++) {
1578
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
1579
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
1580
+ ggml_format_name(k, "cache_k_l%d", i);
1581
+ ggml_format_name(v, "cache_v_l%d", i);
1582
+ cache.k_l.push_back(k);
1583
+ cache.v_l.push_back(v);
1584
+ #ifdef GGML_USE_CUBLAS
1585
+ if (i >= i_gpu_start) {
1586
+ if (offload) {
1587
+ ggml_cuda_assign_buffers_no_scratch(k);
1588
+ vram_kv_cache += ggml_nbytes(k);
1589
+ ggml_cuda_assign_buffers_no_scratch(v);
1590
+ vram_kv_cache += ggml_nbytes(v);
1591
+ }
1551
1592
  }
1593
+ #endif // GGML_USE_CUBLAS
1552
1594
  }
1553
- #endif
1595
+
1596
+ if (vram_kv_cache > 0) {
1597
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1598
+ }
1599
+
1600
+ GGML_UNUSED(n_gpu_layers);
1554
1601
 
1555
1602
  return true;
1556
1603
  }
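The rewritten initializer takes separate K and V cache types and sizes the backing buffer with ggml_row_size(), which reports how many bytes n elements of a given ggml type occupy (and therefore also works for quantized cache types, where elements are stored in blocks). Roughly, with the same variables as above:

    // n_elements is defined outside this hunk and covers the full cache (all layers)
    size_t k_bytes = ggml_row_size(ktype, n_elements);   // K cache, e.g. GGML_TYPE_F16
    size_t v_bytes = ggml_row_size(vtype, n_elements);   // V cache may use a different type
    cache.buf.resize(k_bytes + v_bytes + 2u*n_layer*ggml_tensor_overhead());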
@@ -1771,6 +1818,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
1771
1818
  return buf;
1772
1819
  }
1773
1820
 
1821
+ namespace GGUFMeta {
1822
+ template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
1823
+ struct GKV_Base_Type {
1824
+ static constexpr gguf_type gt = gt_;
1825
+
1826
+ static T getter(const gguf_context * ctx, const int kid) {
1827
+ return gfun(ctx, kid);
1828
+ }
1829
+ };
1830
+
1831
+ template<typename T> struct GKV_Base;
1832
+
1833
+ template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
1834
+ template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
1835
+ template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
1836
+ template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
1837
+ template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
1838
+ template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
1839
+ template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
1840
+ template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
1841
+ template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
1842
+ template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
1843
+ template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
1844
+ template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
1845
+
1846
+ template<> struct GKV_Base<std::string> {
1847
+ static constexpr gguf_type gt = GGUF_TYPE_STRING;
1848
+
1849
+ static std::string getter(const gguf_context * ctx, const int kid) {
1850
+ return gguf_get_val_str(ctx, kid);
1851
+ }
1852
+ };
1853
+
1854
+ struct ArrayInfo{
1855
+ const gguf_type gt;
1856
+ const size_t length;
1857
+ const void * data;
1858
+ };
1859
+
1860
+ template<> struct GKV_Base<ArrayInfo> {
1861
+ public:
1862
+ static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
1863
+ static ArrayInfo getter(const gguf_context *ctx, const int k) {
1864
+ return ArrayInfo {
1865
+ gguf_get_arr_type(ctx, k),
1866
+ size_t(gguf_get_arr_n(ctx, k)),
1867
+ gguf_get_arr_data(ctx, k),
1868
+ };
1869
+ }
1870
+ };
1871
+
1872
+ template<typename T>
1873
+ class GKV: public GKV_Base<T> {
1874
+ GKV() = delete;
1875
+
1876
+ public:
1877
+ static T get_kv(const gguf_context * ctx, const int k) {
1878
+ const enum gguf_type kt = gguf_get_kv_type(ctx, k);
1879
+
1880
+ if (kt != GKV::gt) {
1881
+ throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
1882
+ gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
1883
+ }
1884
+ return GKV::getter(ctx, k);
1885
+ }
1886
+
1887
+ static const char * override_type_to_str(const llama_model_kv_override_type ty) {
1888
+ switch (ty) {
1889
+ case LLAMA_KV_OVERRIDE_BOOL: return "bool";
1890
+ case LLAMA_KV_OVERRIDE_INT: return "int";
1891
+ case LLAMA_KV_OVERRIDE_FLOAT: return "float";
1892
+ }
1893
+ return "unknown";
1894
+ }
1895
+
1896
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
1897
+ if (!override) { return false; }
1898
+ if (override->tag == expected_type) {
1899
+ LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
1900
+ __func__, override_type_to_str(override->tag), override->key);
1901
+ switch (override->tag) {
1902
+ case LLAMA_KV_OVERRIDE_BOOL: {
1903
+ printf("%s\n", override->bool_value ? "true" : "false");
1904
+ } break;
1905
+ case LLAMA_KV_OVERRIDE_INT: {
1906
+ printf("%" PRId64 "\n", override->int_value);
1907
+ } break;
1908
+ case LLAMA_KV_OVERRIDE_FLOAT: {
1909
+ printf("%.6f\n", override->float_value);
1910
+ } break;
1911
+ default:
1912
+ // Shouldn't be possible to end up here, but just in case...
1913
+ throw std::runtime_error(
1914
+ format("Unsupported attempt to override %s type for metadata key %s\n",
1915
+ override_type_to_str(override->tag), override->key));
1916
+ }
1917
+ return true;
1918
+ }
1919
+ LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
1920
+ __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
1921
+ return false;
1922
+ }
1923
+
1924
+ template<typename OT>
1925
+ static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
1926
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1927
+ if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
1928
+ target = override->bool_value;
1929
+ return true;
1930
+ }
1931
+ return true;
1932
+ }
1933
+
1934
+ template<typename OT>
1935
+ static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
1936
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1937
+ if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
1938
+ target = override->int_value;
1939
+ return true;
1940
+ }
1941
+ return false;
1942
+ }
1943
+
1944
+ template<typename OT>
1945
+ static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
1946
+ try_override(T & target, const struct llama_model_kv_override *override) {
1947
+ if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
1948
+ target = override->float_value;
1949
+ return true;
1950
+ }
1951
+ return false;
1952
+ }
1953
+
1954
+ template<typename OT>
1955
+ static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
1956
+ try_override(T & target, const struct llama_model_kv_override *override) {
1957
+ (void)target;
1958
+ (void)override;
1959
+ if (!override) { return false; }
1960
+ // Currently, we should never end up here so it would be a bug if we do.
1961
+ throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
1962
+ override ? override->key : "NULL"));
1963
+ }
1964
+
1965
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
1966
+ if (try_override<T>(target, override)) {
1967
+ return true;
1968
+ }
1969
+ if (k < 0) { return false; }
1970
+ target = get_kv(ctx, k);
1971
+ return true;
1972
+ }
1973
+
1974
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
1975
+ return set(ctx, gguf_find_key(ctx, key), target, override);
1976
+ }
1977
+
1978
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
1979
+ return set(ctx, key.c_str(), target, override);
1980
+ }
1981
+ };
1982
+ }
1983
+
1774
1984
  struct llama_model_loader {
1775
1985
  int n_kv = 0;
1776
1986
  int n_tensors = 0;
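The GGUFMeta helpers added above replace the removed GGUF_GET_KEY macro with typed getters: GKV_Base maps a C++ type to the matching gguf_type and accessor function, GKV<T>::get_kv() verifies the stored type before reading, and GKV<T>::set() consults an optional llama_model_kv_override before falling back to the value in the file. A minimal usage sketch (the key string is illustrative):

    uint32_t n_ctx_train = 0;
    // read "llama.context_length" as a uint32, or apply a matching override if one was supplied
    GGUFMeta::GKV<uint32_t>::set(ctx_gguf, "llama.context_length", n_ctx_train, /*override =*/ nullptr);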
@@ -1786,21 +1996,34 @@ struct llama_model_loader {
1786
1996
  llama_fver fver;
1787
1997
 
1788
1998
  std::unique_ptr<llama_mmap> mapping;
1999
+ std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
1789
2000
 
1790
2001
  struct gguf_context * ctx_gguf = NULL;
1791
2002
  struct ggml_context * ctx_meta = NULL;
1792
2003
 
1793
- llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
2004
+ std::string arch_name;
2005
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
2006
+
2007
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
1794
2008
  struct gguf_init_params params = {
1795
2009
  /*.no_alloc = */ true,
1796
2010
  /*.ctx = */ &ctx_meta,
1797
2011
  };
1798
2012
 
2013
+ if (param_overrides_p != nullptr) {
2014
+ for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
2015
+ kv_overrides.insert({std::string(p->key), *p});
2016
+ }
2017
+ }
2018
+
1799
2019
  ctx_gguf = gguf_init_from_file(fname.c_str(), params);
1800
2020
  if (!ctx_gguf) {
1801
2021
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
1802
2022
  }
1803
2023
 
2024
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
2025
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
2026
+
1804
2027
  n_kv = gguf_get_n_kv(ctx_gguf);
1805
2028
  n_tensors = gguf_get_n_tensors(ctx_gguf);
1806
2029
 
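The loader now accepts an optional list of llama_model_kv_override entries (passed in through llama_model_params.kv_overrides, as seen in llama_model_load below). The loop above stops at the first entry whose key begins with a NUL byte, so callers terminate the list with a zeroed element. A sketch, assuming key is a fixed-size char buffer as the p->key[0] check suggests:

    llama_model_kv_override kvo[2] = {};   // the zeroed second entry terminates the list
    std::snprintf(kvo[0].key, sizeof(kvo[0].key), "%s", "llama.expert_used_count");
    kvo[0].tag       = LLAMA_KV_OVERRIDE_INT;
    kvo[0].int_value = 2;                  // illustrative value

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = kvo;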
@@ -1868,6 +2091,7 @@ struct llama_model_loader {
1868
2091
  }
1869
2092
  }
1870
2093
 
2094
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
1871
2095
  for (int i = 0; i < n_kv; i++) {
1872
2096
  const char * name = gguf_get_key(ctx_gguf, i);
1873
2097
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -1913,19 +2137,59 @@ struct llama_model_loader {
1913
2137
  }
1914
2138
  }
1915
2139
 
1916
- std::string get_arch_name() const {
1917
- const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
2140
+ template<typename T>
2141
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2142
+ get_arr_n(const std::string & key, T & result, const bool required = true) {
2143
+ const int kid = gguf_find_key(ctx_gguf, key.c_str());
1918
2144
 
1919
- std::string arch_name;
1920
- GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
2145
+ if (kid < 0) {
2146
+ if (required) {
2147
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2148
+ }
2149
+ return false;
2150
+ }
2151
+
2152
+ struct GGUFMeta::ArrayInfo arr_info =
2153
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
2154
+
2155
+
2156
+ result = arr_info.length;
2157
+ return true;
2158
+ }
2159
+
2160
+ template<typename T>
2161
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2162
+ get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
2163
+ return get_arr_n(llm_kv(kid), result, required);
2164
+ }
2165
+
2166
+ template<typename T>
2167
+ bool get_key(const std::string & key, T & result, const bool required = true) {
2168
+ auto it = kv_overrides.find(key);
2169
+
2170
+ const struct llama_model_kv_override * override =
2171
+ it != kv_overrides.end() ? &it->second : nullptr;
2172
+
2173
+ const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
1921
2174
 
2175
+ if (required && !found) {
2176
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2177
+ }
2178
+
2179
+ return found;
2180
+ }
2181
+
2182
+ template<typename T>
2183
+ bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
2184
+ return get_key(llm_kv(kid), result, required);
2185
+ }
2186
+
2187
+ std::string get_arch_name() const {
1922
2188
  return arch_name;
1923
2189
  }
1924
2190
 
1925
2191
  enum llm_arch get_arch() const {
1926
- const std::string arch_name = get_arch_name();
1927
-
1928
- return llm_arch_from_string(arch_name);
2192
+ return llm_kv.arch;
1929
2193
  }
1930
2194
 
1931
2195
  const char * get_tensor_name(int i) const {
@@ -1965,10 +2229,13 @@ struct llama_model_loader {
1965
2229
  return tensor;
1966
2230
  }
1967
2231
 
1968
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
2232
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
1969
2233
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1970
2234
 
1971
2235
  if (cur == NULL) {
2236
+ if (!required) {
2237
+ return NULL;
2238
+ }
1972
2239
  throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
1973
2240
  }
1974
2241
 
@@ -2172,11 +2439,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2172
2439
  static void llm_load_hparams(
2173
2440
  llama_model_loader & ml,
2174
2441
  llama_model & model) {
2175
- struct gguf_context * ctx = ml.ctx_gguf;
2176
-
2177
- const auto kv = LLM_KV(model.arch);
2178
-
2179
2442
  auto & hparams = model.hparams;
2443
+ const gguf_context * ctx = ml.ctx_gguf;
2180
2444
 
2181
2445
  // get metadata as string
2182
2446
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2190,42 +2454,51 @@ static void llm_load_hparams(
2190
2454
  }
2191
2455
 
2192
2456
  // get general kv
2193
- GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
2457
+ ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
2194
2458
 
2195
2459
  // get hparams kv
2196
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
2197
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
2198
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
2199
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
2200
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
2201
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
2460
+ ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
2461
+ ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
2462
+ ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
2463
+ ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
2464
+ ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
2465
+ ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
2466
+ ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
2467
+ ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
2468
+
2469
+ GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
2470
+ GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
2471
+ if (hparams.n_expert > 0) {
2472
+ GGML_ASSERT(hparams.n_expert_used > 0);
2473
+ } else {
2474
+ GGML_ASSERT(hparams.n_expert_used == 0);
2475
+ }
2202
2476
 
2203
2477
  // n_head_kv is optional, default to n_head
2204
2478
  hparams.n_head_kv = hparams.n_head;
2205
- GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
2479
+ ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
2206
2480
 
2207
- hparams.rope_finetuned = false;
2208
- GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
2209
- kv(LLM_KV_ROPE_SCALING_FINETUNED));
2481
+ bool rope_finetuned = false;
2482
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2483
+ hparams.rope_finetuned = rope_finetuned;
2210
2484
 
2211
2485
  hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
2212
- GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
2213
- kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
2486
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
2214
2487
 
2215
2488
  // rope_freq_base (optional)
2216
2489
  hparams.rope_freq_base_train = 10000.0f;
2217
- GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
2490
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
2218
2491
 
2219
2492
  std::string rope_scaling("linear");
2220
- GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
2493
+ ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2221
2494
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2222
2495
  GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
2223
2496
 
2224
2497
  // rope_freq_scale (inverse of the kv) is optional
2225
2498
  float ropescale = 0.0f;
2226
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
2227
- if (ropescale == 0.0f) { // try the old key name
2228
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
2499
+ if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
2500
+ // try the old key name
2501
+ ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
2229
2502
  }
2230
2503
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
2231
2504
 
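The assertions above pin down the allowed expert configurations: a model is either dense (n_expert == 0 and n_expert_used == 0) or MoE with 0 < n_expert_used <= n_expert <= LLAMA_MAX_EXPERTS. For instance, a Mixtral-style model with eight experts routing each token to two of them would carry metadata like (illustrative values):

    llama.expert_count      = 8    // dense models omit these keys entirely
    llama.expert_used_count = 2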
@@ -2233,7 +2506,7 @@ static void llm_load_hparams(
2233
2506
  {
2234
2507
  hparams.n_rot = hparams.n_embd / hparams.n_head;
2235
2508
 
2236
- GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
2509
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
2237
2510
 
2238
2511
  if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
2239
2512
  if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2248,7 +2521,7 @@ static void llm_load_hparams(
2248
2521
  switch (model.arch) {
2249
2522
  case LLM_ARCH_LLAMA:
2250
2523
  {
2251
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2524
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2252
2525
 
2253
2526
  switch (hparams.n_layer) {
2254
2527
  case 26: model.type = e_model::MODEL_3B; break;
@@ -2262,7 +2535,7 @@ static void llm_load_hparams(
2262
2535
  } break;
2263
2536
  case LLM_ARCH_FALCON:
2264
2537
  {
2265
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2538
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2266
2539
 
2267
2540
  switch (hparams.n_layer) {
2268
2541
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2272,7 +2545,7 @@ static void llm_load_hparams(
2272
2545
  } break;
2273
2546
  case LLM_ARCH_BAICHUAN:
2274
2547
  {
2275
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2548
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2276
2549
  switch (hparams.n_layer) {
2277
2550
  case 32: model.type = e_model::MODEL_7B; break;
2278
2551
  case 40: model.type = e_model::MODEL_13B; break;
@@ -2281,7 +2554,7 @@ static void llm_load_hparams(
2281
2554
  } break;
2282
2555
  case LLM_ARCH_STARCODER:
2283
2556
  {
2284
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2557
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2285
2558
  switch (hparams.n_layer) {
2286
2559
  case 24: model.type = e_model::MODEL_1B; break;
2287
2560
  case 36: model.type = e_model::MODEL_3B; break;
@@ -2292,7 +2565,7 @@ static void llm_load_hparams(
2292
2565
  } break;
2293
2566
  case LLM_ARCH_PERSIMMON:
2294
2567
  {
2295
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2568
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2296
2569
  switch (hparams.n_layer) {
2297
2570
  case 36: model.type = e_model::MODEL_8B; break;
2298
2571
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2300,7 +2573,7 @@ static void llm_load_hparams(
2300
2573
  } break;
2301
2574
  case LLM_ARCH_REFACT:
2302
2575
  {
2303
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2576
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2304
2577
  switch (hparams.n_layer) {
2305
2578
  case 32: model.type = e_model::MODEL_1B; break;
2306
2579
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2308,7 +2581,7 @@ static void llm_load_hparams(
2308
2581
  } break;
2309
2582
  case LLM_ARCH_BLOOM:
2310
2583
  {
2311
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2584
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2312
2585
 
2313
2586
  switch (hparams.n_layer) {
2314
2587
  case 24: model.type = e_model::MODEL_1B; break;
@@ -2323,9 +2596,9 @@ static void llm_load_hparams(
2323
2596
  {
2324
2597
  hparams.f_clamp_kqv = 0.0f;
2325
2598
 
2326
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2327
- GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2328
- GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2599
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2600
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
2601
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
2329
2602
 
2330
2603
  switch (hparams.n_layer) {
2331
2604
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2335,13 +2608,23 @@ static void llm_load_hparams(
2335
2608
  } break;
2336
2609
  case LLM_ARCH_STABLELM:
2337
2610
  {
2338
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2611
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2339
2612
 
2340
2613
  switch (hparams.n_layer) {
2341
2614
  case 32: model.type = e_model::MODEL_3B; break;
2342
2615
  default: model.type = e_model::MODEL_UNKNOWN;
2343
2616
  }
2344
2617
  } break;
2618
+ case LLM_ARCH_QWEN:
2619
+ {
2620
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2621
+
2622
+ switch (hparams.n_layer) {
2623
+ case 32: model.type = e_model::MODEL_7B; break;
2624
+ case 40: model.type = e_model::MODEL_13B; break;
2625
+ default: model.type = e_model::MODEL_UNKNOWN;
2626
+ }
2627
+ } break;
2345
2628
 
2346
2629
  default: (void)0;
2347
2630
  }
@@ -2383,7 +2666,7 @@ static void llm_load_vocab(
2383
2666
  {
2384
2667
  std::string tokenizer_name;
2385
2668
 
2386
- GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
2669
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
2387
2670
 
2388
2671
  if (tokenizer_name == "llama") {
2389
2672
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2473,34 +2756,31 @@ static void llm_load_vocab(
2473
2756
  };
2474
2757
  for (const auto & it : special_token_types) {
2475
2758
  const std::string & key = kv(std::get<0>(it));
2476
- int32_t & id = std::get<1>(it), old_id = id;
2759
+ int32_t & id = std::get<1>(it);
2477
2760
 
2478
- GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
2479
- // Must be >= -1 and < vocab size. Since the key is unsigned, -1
2480
- // can only come from the default value, so there's no point in
2481
- // validating that.
2482
- if (size_t(id + 1) > vocab.id_to_token.size()) {
2483
- LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
2484
- __func__, key.c_str(), id, old_id);
2485
- id = old_id;
2761
+ uint32_t new_id;
2762
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
2763
+ continue;
2764
+ }
2765
+ if (new_id >= vocab.id_to_token.size()) {
2766
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
2767
+ __func__, key.c_str(), new_id, id);
2768
+ } else {
2769
+ id = new_id;
2486
2770
  }
2487
2771
 
2488
2772
  }
2489
2773
 
2490
2774
  // Handle add_bos_token and add_eos_token
2491
- std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
2492
- int kid = gguf_find_key(ctx, key.c_str());
2493
- enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2494
- vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2495
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2496
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2497
- }
2498
- key = kv(LLM_KV_TOKENIZER_ADD_EOS);
2499
- kid = gguf_find_key(ctx, key.c_str());
2500
- ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2501
- vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2502
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2503
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2775
+ {
2776
+ bool temp = true;
2777
+
2778
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2779
+ vocab.special_add_bos = int(temp);
2780
+ }
2781
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2782
+ vocab.special_add_eos = int(temp);
2783
+ }
2504
2784
  }
2505
2785
  }
2506
2786
 
@@ -2511,7 +2791,7 @@ static void llm_load_vocab(
2511
2791
  // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
2512
2792
  // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
2513
2793
  // are special tokens.
2514
- // From testing, this appears to corelate 1:1 with special tokens.
2794
+ // From testing, this appears to correlate 1:1 with special tokens.
2515
2795
  //
2516
2796
 
2517
2797
  // Counting special tokens and verifying in only one direction
@@ -2624,6 +2904,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2624
2904
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2625
2905
  LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2626
2906
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2907
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
2908
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
2627
2909
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
2628
2910
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2629
2911
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2733,14 +3015,7 @@ static void llm_load_tensors(
2733
3015
  ggml_backend_type backend_output;
2734
3016
 
2735
3017
  if (n_gpu_layers > int(n_layer)) {
2736
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2737
- // on Windows however this is detrimental unless everything is on the GPU
2738
- #ifndef _WIN32
2739
- backend_norm = llama_backend_offload;
2740
- #else
2741
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2742
- #endif // _WIN32
2743
-
3018
+ backend_norm = llama_backend_offload;
2744
3019
  backend_output = llama_backend_offload_split;
2745
3020
  } else {
2746
3021
  backend_norm = GGML_BACKEND_CPU;
@@ -2777,17 +3052,55 @@ static void llm_load_tensors(
2777
3052
  layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2778
3053
  layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2779
3054
 
3055
+ // optional bias tensors
3056
+ layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
3057
+ layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
3058
+ layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
3059
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
3060
+
2780
3061
  layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2781
3062
 
2782
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
2783
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2784
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3063
+ layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
3064
+
3065
+ if (layer.ffn_gate_inp == nullptr) {
3066
+ GGML_ASSERT(hparams.n_expert == 0);
3067
+ GGML_ASSERT(hparams.n_expert_used == 0);
3068
+
3069
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3070
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3071
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3072
+ } else {
3073
+ GGML_ASSERT(hparams.n_expert > 0);
3074
+ GGML_ASSERT(hparams.n_expert_used > 0);
3075
+
3076
+ // MoE branch
3077
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
3078
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3079
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
3080
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3081
+ }
3082
+ }
2785
3083
 
2786
3084
  if (backend == GGML_BACKEND_GPU) {
2787
3085
  vram_weights +=
2788
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2789
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2790
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3086
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3087
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
3088
+ (layer.bq ? ggml_nbytes(layer.bq) : 0) +
3089
+ (layer.bk ? ggml_nbytes(layer.bk) : 0) +
3090
+ (layer.bv ? ggml_nbytes(layer.bv) : 0) +
3091
+ (layer.bo ? ggml_nbytes(layer.bo) : 0) +
3092
+ ggml_nbytes(layer.ffn_norm);
3093
+
3094
+ if (layer.ffn_gate_inp == nullptr) {
3095
+ vram_weights +=
3096
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3097
+ } else {
3098
+ vram_weights += ggml_nbytes(layer.ffn_gate_inp);
3099
+ for (uint32_t x = 0; x < hparams.n_expert; ++x) {
3100
+ vram_weights +=
3101
+ ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
3102
+ }
3103
+ }
2791
3104
  }
2792
3105
  }
2793
3106
  } break;
@@ -2799,14 +3112,7 @@ static void llm_load_tensors(
2799
3112
  ggml_backend_type backend_output;
2800
3113
 
2801
3114
  if (n_gpu_layers > int(n_layer)) {
2802
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2803
- // on Windows however this is detrimental unless everything is on the GPU
2804
- #ifndef _WIN32
2805
- backend_norm = llama_backend_offload;
2806
- #else
2807
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2808
- #endif // _WIN32
2809
-
3115
+ backend_norm = llama_backend_offload;
2810
3116
  backend_output = llama_backend_offload_split;
2811
3117
  } else {
2812
3118
  backend_norm = GGML_BACKEND_CPU;
@@ -2869,14 +3175,7 @@ static void llm_load_tensors(
2869
3175
  ggml_backend_type backend_output;
2870
3176
 
2871
3177
  if (n_gpu_layers > int(n_layer)) {
2872
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2873
- // on Windows however this is detrimental unless everything is on the GPU
2874
- #ifndef _WIN32
2875
- backend_norm = llama_backend_offload;
2876
- #else
2877
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2878
- #endif // _WIN32
2879
-
3178
+ backend_norm = llama_backend_offload;
2880
3179
  backend_output = llama_backend_offload_split;
2881
3180
  } else {
2882
3181
  backend_norm = GGML_BACKEND_CPU;
@@ -2946,14 +3245,7 @@ static void llm_load_tensors(
2946
3245
  ggml_backend_type backend_output;
2947
3246
 
2948
3247
  if (n_gpu_layers > int(n_layer)) {
2949
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2950
- // on Windows however this is detrimental unless everything is on the GPU
2951
- #ifndef _WIN32
2952
- backend_norm = llama_backend_offload;
2953
- #else
2954
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2955
- #endif // _WIN32
2956
-
3248
+ backend_norm = llama_backend_offload;
2957
3249
  backend_output = llama_backend_offload_split;
2958
3250
  } else {
2959
3251
  backend_norm = GGML_BACKEND_CPU;
@@ -3023,21 +3315,7 @@ static void llm_load_tensors(
3023
3315
  ggml_backend_type backend_output;
3024
3316
 
3025
3317
  if (n_gpu_layers > int(n_layer)) {
3026
- #ifdef GGML_USE_CUBLAS
3027
- if (n_gpu_layers > int(n_layer + 1)) {
3028
- LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
3029
- __func__, n_layer + 1);
3030
- throw std::runtime_error("Persimmon CUDA offload failed");
3031
- }
3032
- #endif
3033
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3034
- // on Windows however this is detrimental unless everything is on the GPU
3035
- #ifndef _WIN32
3036
- backend_norm = llama_backend_offload;
3037
- #else
3038
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3039
- #endif // _WIN32
3040
-
3318
+ backend_norm = llama_backend_offload;
3041
3319
  backend_output = llama_backend_offload_split;
3042
3320
  } else {
3043
3321
  backend_norm = GGML_BACKEND_CPU;
@@ -3096,14 +3374,7 @@ static void llm_load_tensors(
3096
3374
  ggml_backend_type backend_output;
3097
3375
 
3098
3376
  if (n_gpu_layers > int(n_layer)) {
3099
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3100
- // on Windows however this is detrimental unless everything is on the GPU
3101
- #ifndef _WIN32
3102
- backend_norm = llama_backend_offload;
3103
- #else
3104
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3105
- #endif // _WIN32
3106
-
3377
+ backend_norm = llama_backend_offload;
3107
3378
  backend_output = llama_backend_offload_split;
3108
3379
  } else {
3109
3380
  backend_norm = GGML_BACKEND_CPU;
@@ -3174,14 +3445,7 @@ static void llm_load_tensors(
3174
3445
  ggml_backend_type backend_output;
3175
3446
 
3176
3447
  if (n_gpu_layers > int(n_layer)) {
3177
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3178
- // on Windows however this is detrimental unless everything is on the GPU
3179
- #ifndef _WIN32
3180
- backend_norm = llama_backend_offload;
3181
- #else
3182
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3183
- #endif // _WIN32
3184
-
3448
+ backend_norm = llama_backend_offload;
3185
3449
  backend_output = llama_backend_offload_split;
3186
3450
  } else {
3187
3451
  backend_norm = GGML_BACKEND_CPU;
@@ -3241,14 +3505,7 @@ static void llm_load_tensors(
3241
3505
  ggml_backend_type backend_output;
3242
3506
 
3243
3507
  if (n_gpu_layers > int(n_layer)) {
3244
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3245
- // on Windows however this is detrimental unless everything is on the GPU
3246
- #ifndef _WIN32
3247
- backend_norm = llama_backend_offload;
3248
- #else
3249
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3250
- #endif // _WIN32
3251
-
3508
+ backend_norm = llama_backend_offload;
3252
3509
  backend_output = llama_backend_offload_split;
3253
3510
  } else {
3254
3511
  backend_norm = GGML_BACKEND_CPU;
@@ -3305,6 +3562,64 @@ static void llm_load_tensors(
3305
3562
  }
3306
3563
  }
3307
3564
  } break;
3565
+ case LLM_ARCH_QWEN:
3566
+ {
3567
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3568
+ {
3569
+ ggml_backend_type backend_norm;
3570
+ ggml_backend_type backend_output;
3571
+
3572
+ if (n_gpu_layers > int(n_layer)) {
3573
+ backend_norm = llama_backend_offload;
3574
+ backend_output = llama_backend_offload_split;
3575
+ } else {
3576
+ backend_norm = GGML_BACKEND_CPU;
3577
+ backend_output = GGML_BACKEND_CPU;
3578
+ }
3579
+
3580
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3581
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3582
+
3583
+ if (backend_norm == GGML_BACKEND_GPU) {
3584
+ vram_weights += ggml_nbytes(model.output_norm);
3585
+ }
3586
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3587
+ vram_weights += ggml_nbytes(model.output);
3588
+ }
3589
+ }
3590
+
3591
+ const uint32_t n_ff = hparams.n_ff / 2;
3592
+
3593
+ const int i_gpu_start = n_layer - n_gpu_layers;
3594
+
3595
+ model.layers.resize(n_layer);
3596
+
3597
+ for (uint32_t i = 0; i < n_layer; ++i) {
3598
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3599
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3600
+
3601
+ auto & layer = model.layers[i];
3602
+
3603
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3604
+
3605
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
3606
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
3607
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3608
+
3609
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3610
+
3611
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3612
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3613
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3614
+
3615
+ if (backend == GGML_BACKEND_GPU) {
3616
+ vram_weights +=
3617
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3618
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3619
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3620
+ }
3621
+ }
3622
+ } break;
3308
3623
 
3309
3624
  default:
3310
3625
  throw std::runtime_error("unknown architecture");
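The new Qwen path differs from the LLaMA one in two visible ways: attention uses a single fused QKV projection with a bias (attn_qkv weight and bias instead of separate Q/K/V tensors), and the feed-forward tensors are created with half of the stored n_ff. The Qwen graph-building code is not part of this excerpt; as a sketch only (not necessarily the exact code used), a fused QKV output is typically split back into Q, K and V with strided views:

    // qkv = wqkv * cur + bqkv, laid out as [3*n_embd, n_tokens]
    struct ggml_tensor * Qcur = ggml_view_2d(ctx0, qkv, n_embd, n_tokens, qkv->nb[1], 0*n_embd*ggml_element_size(qkv));
    struct ggml_tensor * Kcur = ggml_view_2d(ctx0, qkv, n_embd, n_tokens, qkv->nb[1], 1*n_embd*ggml_element_size(qkv));
    struct ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd, n_tokens, qkv->nb[1], 2*n_embd*ggml_element_size(qkv));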
@@ -3331,8 +3646,8 @@ static void llm_load_tensors(
3331
3646
  }
3332
3647
 
3333
3648
  #ifdef GGML_USE_CUBLAS
3334
- const int max_backend_supported_layers = hparams.n_layer + 3;
3335
- const int max_offloadable_layers = hparams.n_layer + 3;
3649
+ const int max_backend_supported_layers = hparams.n_layer + 1;
3650
+ const int max_offloadable_layers = hparams.n_layer + 1;
3336
3651
  #elif GGML_USE_CLBLAST
3337
3652
  const int max_backend_supported_layers = hparams.n_layer + 1;
3338
3653
  const int max_offloadable_layers = hparams.n_layer + 1;
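With the KV cache now allocated per layer and offloaded via the offload_kqv flag, the two extra pseudo-layers that previously stood for the whole K and V caches are gone, which appears to be why the CUDA limit drops from n_layer + 3 to n_layer + 1 (the remaining +1 covering the non-repeating output tensors).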
@@ -3373,7 +3688,7 @@ static void llm_load_tensors(
3373
3688
 
3374
3689
  static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
3375
3690
  try {
3376
- llama_model_loader ml(fname, params.use_mmap);
3691
+ llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
3377
3692
 
3378
3693
  model.hparams.vocab_only = params.vocab_only;
3379
3694
 
@@ -3500,11 +3815,11 @@ static void llm_build_k_shift(
3500
3815
  struct ggml_tensor * tmp =
3501
3816
  // we rotate only the first n_rot dimensions
3502
3817
  ggml_rope_custom_inplace(ctx,
3503
- ggml_view_3d(ctx, kv.k,
3818
+ ggml_view_3d(ctx, kv.k_l[il],
3504
3819
  n_embd_head, n_head_kv, n_ctx,
3505
- ggml_element_size(kv.k)*n_embd_head,
3506
- ggml_element_size(kv.k)*n_embd_gqa,
3507
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
3820
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
3821
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
3822
+ 0),
3508
3823
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
3509
3824
  ext_factor, attn_factor, beta_fast, beta_slow);
3510
3825
  cb(tmp, "K_shifted", il);
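Switching the cache views from ggml_element_size() to ggml_row_size() is what makes quantized K caches possible: quantized ggml types store elements in blocks, so a byte offset cannot be computed as count*element_size; ggml_row_size(type, n) returns the actual number of bytes that n elements of a row occupy. A small illustration (block layout is that of ggml's Q8_0 format):

    // F16:  ggml_row_size(GGML_TYPE_F16,  128) == 128 * 2 bytes       (same as 128 * element size)
    // Q8_0: ggml_row_size(GGML_TYPE_Q8_0, 128) == (128/32) * 34 bytes (32-element blocks of 34 bytes)
    const size_t k_row_bytes = ggml_row_size(kv.k_l[il]->type, n_embd_gqa);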
@@ -3531,13 +3846,13 @@ static void llm_build_kv_store(
3531
3846
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
3532
3847
  cb(v_cur_t, "v_cur_t", il);
3533
3848
 
3534
- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa,
3535
- (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3849
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
3850
+ (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
3536
3851
  cb(k_cache_view, "k_cache_view", il);
3537
3852
 
3538
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa,
3539
- ( n_ctx)*ggml_element_size(kv.v),
3540
- (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v));
3853
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
3854
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
3855
+ (kv_head)*ggml_element_size(kv.v_l[il]));
3541
3856
  cb(v_cache_view, "v_cache_view", il);
3542
3857
 
3543
3858
  // important: storing RoPE-ed version of K in the KV cache!
@@ -3689,11 +4004,11 @@ static struct ggml_tensor * llm_build_kqv(
  cb(q, "q", il);
 
  struct ggml_tensor * k =
- ggml_view_3d(ctx, kv.k,
+ ggml_view_3d(ctx, kv.k_l[il],
  n_embd_head, n_kv, n_head_kv,
- ggml_element_size(kv.k)*n_embd_gqa,
- ggml_element_size(kv.k)*n_embd_head,
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il);
+ ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+ ggml_row_size(kv.k_l[il]->type, n_embd_head),
+ 0);
  cb(k, "k", il);
 
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -3724,11 +4039,11 @@ static struct ggml_tensor * llm_build_kqv(
 
  // split cached v into n_head heads
  struct ggml_tensor * v =
- ggml_view_3d(ctx, kv.v,
+ ggml_view_3d(ctx, kv.v_l[il],
  n_kv, n_embd_head, n_head_kv,
- ggml_element_size(kv.v)*n_ctx,
- ggml_element_size(kv.v)*n_ctx*n_embd_head,
- ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il);
+ ggml_element_size(kv.v_l[il])*n_ctx,
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+ 0);
  cb(v, "v", il);
 
  struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
@@ -3766,6 +4081,8 @@ struct llm_build_context {
  const int64_t n_head_kv;
  const int64_t n_embd_head;
  const int64_t n_embd_gqa;
+ const int64_t n_expert;
+ const int64_t n_expert_used;
 
  const float freq_base;
  const float freq_scale;
@@ -3807,6 +4124,8 @@ struct llm_build_context {
  n_head_kv (hparams.n_head_kv),
  n_embd_head (hparams.n_embd_head()),
  n_embd_gqa (hparams.n_embd_gqa()),
+ n_expert (hparams.n_expert),
+ n_expert_used (hparams.n_expert_used),
  freq_base (cparams.rope_freq_base),
  freq_scale (cparams.rope_freq_scale),
  ext_factor (cparams.yarn_ext_factor),
@@ -3886,12 +4205,24 @@ struct llm_build_context {
  // compute Q and K and RoPE them
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
 
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
 
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
 
  Qcur = ggml_rope_custom(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3910,7 +4241,7 @@ struct llm_build_context {
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
  cur = llm_build_kqv(ctx0, hparams, kv_self,
- model.layers[il].wo, NULL,
+ model.layers[il].wo, model.layers[il].bo,
  Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
  cb(cur, "kqv_out", il);
  }
@@ -3919,7 +4250,7 @@ struct llm_build_context {
  cb(ffn_inp, "ffn_inp", il);
 
  // feed-forward network
- {
+ if (model.layers[il].ffn_gate_inp == nullptr) {
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -3931,6 +4262,69 @@ struct llm_build_context {
  model.layers[il].ffn_down, NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+ cb(logits, "ffn_moe_logits", il);
+
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+ cb(probs, "ffn_moe_probs", il);
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+ ggml_tensor * weights = ggml_get_rows(ctx0,
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+ cb(weights, "ffn_moe_weights", il);
+
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ // compute expert outputs
+ ggml_tensor * moe_out = nullptr;
+
+ for (int i = 0; i < n_expert_used; ++i) {
+ ggml_tensor * cur_expert;
+
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+ cb(cur_up, "ffn_moe_up", il);
+
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+ cb(cur_gate, "ffn_moe_gate", il);
+
+ cur_gate = ggml_silu(ctx0, cur_gate);
+ cb(cur_gate, "ffn_moe_silu", il);
+
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_gate_par", il);
+
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+ cb(cur_expert, "ffn_moe_down", il);
+
+ cur_expert = ggml_mul(ctx0, cur_expert,
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+ cb(cur_expert, "ffn_moe_weighted", il);
+
+ if (i == 0) {
+ moe_out = cur_expert;
+ } else {
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
+ cb(moe_out, "ffn_moe_out", il);
+ }
+ }
+
+ cur = moe_out;
  }
 
  cur = ggml_add(ctx0, cur, ffn_inp);
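For reference, the routing performed by the MoE branch above, restated as a single-token sketch in plain C++ (toy logits, the expert FFN itself stubbed out; in the graph these steps are the ggml_soft_max, ggml_top_k, ggml_get_rows/ggml_div and ggml_mul_mat_id ops applied to all tokens at once):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const int n_expert      = 8;
        const int n_expert_used = 2;

        // gate logits for one token (ffn_moe_logits), made-up values
        std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.5f, 1.5f, -0.3f, 0.0f, 0.7f};

        // softmax -> ffn_moe_probs
        const float max_l = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(n_expert);
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - max_l); sum += probs[e]; }
        for (float & p : probs) { p /= sum; }

        // top-k expert indices -> ffn_moe_argsort / selected_experts
        std::vector<int> idx(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected probabilities -> ffn_moe_weights_norm
        float wsum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) { wsum += probs[idx[i]]; }

        // weighted sum of the expert outputs -> ffn_moe_out
        float moe_out = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            const float expert_out = 1.0f + idx[i]; // stand-in for down(silu(gate) * up)
            moe_out += probs[idx[i]] / wsum * expert_out;
        }

        printf("selected experts: %d, %d -> moe_out = %.3f\n", idx[0], idx[1], moe_out);
        return 0;
    }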
@@ -4308,6 +4702,7 @@ struct llm_build_context {
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  cb(inpL, "imp_embd", -1);
 
+ // inp_pos - contains the positions
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  cb(inp_pos, "inp_pos", -1);
 
@@ -4315,6 +4710,7 @@ struct llm_build_context {
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
  cb(KQ_scale, "KQ_scale", -1);
 
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  cb(KQ_mask, "KQ_mask", -1);
@@ -4903,6 +5299,121 @@ struct llm_build_context {
 
  return gf;
  }
+
+ struct ggml_cgraph * build_qwen() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ // using mode = 2 for neox mode
+ Qcur = ggml_rope_custom(
+ ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward forward
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };
 
  //
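build_qwen() above derives Q, K and V from a single fused wqkv projection: each token's output row holds [Q | K | V] back to back, and the three ggml_view_2d calls slice that row at byte offsets 0, n_embd and 2*n_embd floats. A toy illustration of the same layout with plain arrays (sizes made up):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd   = 4;
        const int n_tokens = 2;

        // fused QKV buffer: per token, n_embd Q values, then n_embd K, then n_embd V
        std::vector<float> qkv(3 * n_embd * n_tokens);
        for (size_t i = 0; i < qkv.size(); ++i) { qkv[i] = (float) i; }

        for (int t = 0; t < n_tokens; ++t) {
            const float * row = qkv.data() + t * 3 * n_embd;
            const float * Q = row + 0 * n_embd;  // offset 0
            const float * K = row + 1 * n_embd;  // offset n_embd floats
            const float * V = row + 2 * n_embd;  // offset 2*n_embd floats
            printf("token %d: Q[0]=%.0f K[0]=%.0f V[0]=%.0f\n", t, Q[0], K[0], V[0]);
        }
        return 0;
    }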
@@ -4913,8 +5424,8 @@ struct llm_build_context {
  enum llm_offload_func_e {
  OFFLOAD_FUNC_NOP,
  OFFLOAD_FUNC,
- OFFLOAD_FUNC_KQ,
- OFFLOAD_FUNC_V,
+ OFFLOAD_FUNC_FRC, // force offload
+ OFFLOAD_FUNC_KQV,
  OFFLOAD_FUNC_NR,
  OFFLOAD_FUNC_EMB,
  OFFLOAD_FUNC_OUT,
@@ -5000,11 +5511,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
  { "pos_embd", OFFLOAD_FUNC_NR },
 
- { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope)
- { "KQ_scale", OFFLOAD_FUNC_KQ },
- { "KQ_mask", OFFLOAD_FUNC_KQ },
- { "K_shift", OFFLOAD_FUNC_KQ },
- { "K_shifted", OFFLOAD_FUNC_KQ },
+ { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
+ { "KQ_scale", OFFLOAD_FUNC_FRC },
+ { "KQ_mask", OFFLOAD_FUNC_FRC },
+ { "K_shift", OFFLOAD_FUNC_FRC },
+
+ { "K_shifted", OFFLOAD_FUNC },
 
  { "inp_norm", OFFLOAD_FUNC_NR },
  { "inp_norm_w", OFFLOAD_FUNC_NR },
@@ -5017,38 +5529,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "attn_norm", OFFLOAD_FUNC },
  { "attn_norm_2", OFFLOAD_FUNC },
 
- { "wqkv", OFFLOAD_FUNC_KQ },
- { "bqkv", OFFLOAD_FUNC_KQ },
- { "wqkv_clamped", OFFLOAD_FUNC_KQ },
-
- { "tmpk", OFFLOAD_FUNC_KQ },
- { "tmpq", OFFLOAD_FUNC_KQ },
- { "tmpv", OFFLOAD_FUNC_V },
- { "Kcur", OFFLOAD_FUNC_KQ },
- { "Qcur", OFFLOAD_FUNC_KQ },
- { "Vcur", OFFLOAD_FUNC_V },
-
- { "krot", OFFLOAD_FUNC_KQ },
- { "qrot", OFFLOAD_FUNC_KQ },
- { "kpass", OFFLOAD_FUNC_KQ },
- { "qpass", OFFLOAD_FUNC_KQ },
- { "krotated", OFFLOAD_FUNC_KQ },
- { "qrotated", OFFLOAD_FUNC_KQ },
-
- { "q", OFFLOAD_FUNC_KQ },
- { "k", OFFLOAD_FUNC_KQ },
- { "kq", OFFLOAD_FUNC_KQ },
- { "kq_scaled", OFFLOAD_FUNC_KQ },
- { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
- { "kq_masked", OFFLOAD_FUNC_KQ },
- { "kq_soft_max", OFFLOAD_FUNC_V },
- { "kq_soft_max_ext", OFFLOAD_FUNC_V },
- { "v", OFFLOAD_FUNC_V },
- { "kqv", OFFLOAD_FUNC_V },
- { "kqv_merged", OFFLOAD_FUNC_V },
- { "kqv_merged_cont", OFFLOAD_FUNC_V },
- { "kqv_wo", OFFLOAD_FUNC_V },
- { "kqv_out", OFFLOAD_FUNC_V },
+ { "wqkv", OFFLOAD_FUNC_KQV },
+ { "bqkv", OFFLOAD_FUNC_KQV },
+ { "wqkv_clamped", OFFLOAD_FUNC_KQV },
+
+ { "tmpk", OFFLOAD_FUNC_KQV },
+ { "tmpq", OFFLOAD_FUNC_KQV },
+ { "tmpv", OFFLOAD_FUNC_KQV },
+ { "Kcur", OFFLOAD_FUNC_KQV },
+ { "Qcur", OFFLOAD_FUNC_KQV },
+ { "Vcur", OFFLOAD_FUNC_KQV },
+
+ { "krot", OFFLOAD_FUNC_KQV },
+ { "qrot", OFFLOAD_FUNC_KQV },
+ { "kpass", OFFLOAD_FUNC_KQV },
+ { "qpass", OFFLOAD_FUNC_KQV },
+ { "krotated", OFFLOAD_FUNC_KQV },
+ { "qrotated", OFFLOAD_FUNC_KQV },
+
+ { "q", OFFLOAD_FUNC_KQV },
+ { "k", OFFLOAD_FUNC_KQV },
+ { "kq", OFFLOAD_FUNC_KQV },
+ { "kq_scaled", OFFLOAD_FUNC_KQV },
+ { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
+ { "kq_masked", OFFLOAD_FUNC_KQV },
+ { "kq_soft_max", OFFLOAD_FUNC_KQV },
+ { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
+ { "v", OFFLOAD_FUNC_KQV },
+ { "kqv", OFFLOAD_FUNC_KQV },
+ { "kqv_merged", OFFLOAD_FUNC_KQV },
+ { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
+ { "kqv_wo", OFFLOAD_FUNC_KQV },
+ { "kqv_out", OFFLOAD_FUNC_KQV },
 
  { "ffn_inp", OFFLOAD_FUNC },
  { "ffn_norm", OFFLOAD_FUNC },
@@ -5067,6 +5579,20 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "ffn_relu", OFFLOAD_FUNC },
  { "ffn_sqr(relu)", OFFLOAD_FUNC },
 
+ { "ffn_moe_logits", OFFLOAD_FUNC },
+ { "ffn_moe_probs", OFFLOAD_FUNC },
+ { "ffn_moe_argsort", OFFLOAD_FUNC },
+ { "ffn_moe_weights", OFFLOAD_FUNC },
+ { "ffn_moe_weights_sum", OFFLOAD_FUNC },
+ { "ffn_moe_weights_norm", OFFLOAD_FUNC },
+ { "ffn_moe_weighted", OFFLOAD_FUNC },
+ { "ffn_moe_up", OFFLOAD_FUNC },
+ { "ffn_moe_gate", OFFLOAD_FUNC },
+ { "ffn_moe_silu", OFFLOAD_FUNC },
+ { "ffn_moe_gate_par", OFFLOAD_FUNC },
+ { "ffn_moe_down", OFFLOAD_FUNC },
+ { "ffn_moe_out", OFFLOAD_FUNC },
+
  { "l_out", OFFLOAD_FUNC },
 
  { "result_norm", OFFLOAD_FUNC_EMB },
@@ -5240,15 +5766,15 @@ static struct ggml_cgraph * llama_build_graph(
  { OFFLOAD_FUNC_NOP, "CPU" },
  { OFFLOAD_FUNC_OUT, "CPU" },
  #ifdef GGML_USE_CUBLAS
- { OFFLOAD_FUNC, "GPU (CUDA)" },
- { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" },
- { OFFLOAD_FUNC_V, "GPU (CUDA) V" },
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
+ { OFFLOAD_FUNC, "GPU (CUDA)" },
+ { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
+ { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
+ { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
  { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
  #else
  { OFFLOAD_FUNC, "CPU" },
- { OFFLOAD_FUNC_KQ, "CPU" },
- { OFFLOAD_FUNC_V, "CPU" },
+ { OFFLOAD_FUNC_FRC, "CPU" },
+ { OFFLOAD_FUNC_KQV, "CPU" },
  { OFFLOAD_FUNC_NR, "CPU" },
  { OFFLOAD_FUNC_EMB, "CPU" },
  #endif // GGML_USE_CUBLAS
@@ -5281,18 +5807,23 @@ static struct ggml_cgraph * llama_build_graph(
  }
  }
  break;
- case OFFLOAD_FUNC_NR:
- if (n_gpu_layers <= n_layer + 0) {
+ case OFFLOAD_FUNC_FRC:
+ if (!lctx.cparams.offload_kqv) {
  func_e = OFFLOAD_FUNC_NOP;
- }
- break;
- case OFFLOAD_FUNC_V:
- if (n_gpu_layers <= n_layer + 1) {
+ } break;
+ case OFFLOAD_FUNC_KQV:
+ if (!lctx.cparams.offload_kqv) {
  func_e = OFFLOAD_FUNC_NOP;
+ } else {
+ if (n_gpu_layers < n_layer) {
+ if (il < i_gpu_start) {
+ func_e = OFFLOAD_FUNC_NOP;
+ }
+ }
  }
  break;
- case OFFLOAD_FUNC_KQ:
- if (n_gpu_layers <= n_layer + 2) {
+ case OFFLOAD_FUNC_NR:
+ if (n_gpu_layers <= n_layer + 0) {
  func_e = OFFLOAD_FUNC_NOP;
  }
  break;
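Condensed, the reworked decision above gates the force-offload (FRC) and KQV tensor groups on the new cparams.offload_kqv flag (plus the usual per-layer check for KQV) instead of the old n_layer + 1 / + 2 thresholds. A trimmed-down standalone restatement, with the enum and helper reduced to what the sketch needs:

    #include <cstdio>

    enum offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, OFFLOAD_FUNC_FRC, OFFLOAD_FUNC_KQV };

    static offload_func_e decide_offload(offload_func_e func_e, bool offload_kqv,
                                         int n_gpu_layers, int n_layer, int il, int i_gpu_start) {
        switch (func_e) {
            case OFFLOAD_FUNC_FRC:
                return offload_kqv ? func_e : OFFLOAD_FUNC_NOP;
            case OFFLOAD_FUNC_KQV:
                if (!offload_kqv)                               return OFFLOAD_FUNC_NOP;
                if (n_gpu_layers < n_layer && il < i_gpu_start) return OFFLOAD_FUNC_NOP;
                return func_e;
            default:
                return func_e;
        }
    }

    int main() {
        // with offload_kqv disabled the KQV tensors stay on the CPU ...
        printf("%d\n", decide_offload(OFFLOAD_FUNC_KQV, false, 33, 32, 0, 0)); // prints 0 (NOP)
        // ... and with it enabled (and the layer offloaded) they keep the GPU path
        printf("%d\n", decide_offload(OFFLOAD_FUNC_KQV, true,  33, 32, 0, 0)); // prints 3 (KQV)
        return 0;
    }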
@@ -5317,8 +5848,8 @@ static struct ggml_cgraph * llama_build_graph(
  case OFFLOAD_FUNC_NOP:
  case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
  case OFFLOAD_FUNC:
- case OFFLOAD_FUNC_KQ:
- case OFFLOAD_FUNC_V:
+ case OFFLOAD_FUNC_KQV:
+ case OFFLOAD_FUNC_FRC:
  case OFFLOAD_FUNC_NR:
  case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
  default: GGML_ASSERT(false);
@@ -5377,6 +5908,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_stablelm();
  } break;
+ case LLM_ARCH_QWEN:
+ {
+ result = llm.build_qwen();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -5454,7 +5989,7 @@ static int llama_decode_internal(
  const int64_t n_embd = hparams.n_embd;
  const int64_t n_vocab = hparams.n_vocab;
 
- // helpers for smoother batch API transistion
+ // helpers for smoother batch API transition
  // after deprecating the llama_eval calls, these will be removed
  std::vector<llama_pos> pos;
@@ -5499,8 +6034,8 @@ static int llama_decode_internal(
  // a heuristic, to avoid attending the full cache if it is not yet utilized
  // after enough generations, the benefit from this heuristic disappears
  // if we start defragmenting the cache, the benefit from this will be more important
- //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
 
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
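The updated heuristic rounds the attended cache size up to a multiple of 32 (with a floor of 32 and a cap of n_ctx). A quick numeric illustration; the GGML_PAD definition below mirrors the usual one in ggml.h, but treat it as an assumption here:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        const int32_t n_ctx    = 4096;
        const int32_t cell_max = 100;   // stand-in for llama_kv_cache_cell_max(kv_self)

        const int32_t n = std::min(n_ctx, std::max<int32_t>(32, GGML_PAD(cell_max, 32)));
        printf("kv_self.n = %d\n", n);  // 128: 100 padded up to a multiple of 32
        return 0;
    }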
@@ -5551,7 +6086,7 @@ static int llama_decode_internal(
  n_threads = std::min(4, n_threads);
  }
 
- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
  if (ggml_cpu_has_cublas() && fully_offloaded) {
  n_threads = 1;
  }
@@ -6233,12 +6768,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
  // loop over the text
  while (true) {
- // find the first occurence of a given special token in this fragment
+ // find the first occurrence of a given special token in this fragment
  // passing offset argument only limit the "search area" but match coordinates
  // are still relative to the source full raw_text
  auto match = raw_text->find(special_token, raw_text_base_offset);
 
- // no occurences found, stop processing this fragment for a given special token
+ // no occurrences found, stop processing this fragment for a given special token
  if (match == std::string::npos) break;
 
  // check if match is within bounds of offset <-> length
@@ -6410,14 +6945,13 @@ struct llama_grammar_candidate {
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
- const char * src,
- size_t n_src,
+ const std::string & src,
  llama_partial_utf8 partial_start) {
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
- const char * pos = src;
+ const char * pos = src.c_str();
  std::vector<uint32_t> code_points;
  // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
- code_points.reserve(n_src + 1);
+ code_points.reserve(src.size() + 1);
  uint32_t value = partial_start.value;
  int n_remain = partial_start.n_remain;
@@ -6468,13 +7002,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
  }
 
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
- std::string src,
- llama_partial_utf8 partial_start
- ) {
- return decode_utf8(src.c_str(), src.size(), partial_start);
- }
-
  // returns true iff pos points to the end of one of the definitions of a rule
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
  switch (pos->type) {
@@ -7113,7 +7640,9 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  const llama_token eos = llama_token_eos(&ctx->model);
 
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+ candidates_decoded.reserve(candidates->size);
  std::vector<llama_grammar_candidate> candidates_grammar;
+ candidates_grammar.reserve(candidates->size);
 
  for (size_t i = 0; i < candidates->size; ++i) {
  const llama_token id = candidates->data[i].id;
@@ -7443,7 +7972,7 @@ struct llama_beam_search_data {
  }
 
  // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
- // The repetative patterns below reflect the 2 stages of heaps:
+ // The repetitive patterns below reflect the 2 stages of heaps:
  // * Gather elements until the vector is full, then call std::make_heap() on it.
  // * If the heap is full and a new element is found that should be included, pop the
  // least element to the back(), replace it with the new, then push it into the heap.
@@ -7650,18 +8179,21 @@ static void llama_convert_tensor_internal(
  return;
  }
 
- auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
- auto block_size_bytes = ggml_type_size(tensor->type);
+ size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+ size_t block_size_bytes = ggml_type_size(tensor->type);
 
  GGML_ASSERT(nelements % block_size == 0);
- auto nblocks = nelements / block_size;
- auto blocks_per_thread = nblocks / nthread;
- auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+ size_t nblocks = nelements / block_size;
+ size_t blocks_per_thread = nblocks / nthread;
+ size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+ size_t in_buff_offs = 0;
+ size_t out_buff_offs = 0;
 
- for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
- auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
- auto thr_elems = thr_blocks * block_size; // number of elements for this thread
- auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+ for (int tnum = 0; tnum < nthread; tnum++) {
+ size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+ size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+ size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
 
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
  if (typ == GGML_TYPE_F16) {
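The rewritten loop above replaces the mixed-type auto declarations with explicit size_t / int and hoists the buffer offsets out of the for-init; the block partitioning itself is unchanged. Restated on its own:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t nblocks = 1000;
        const int    nthread = 8;

        const size_t blocks_per_thread = nblocks / nthread;
        const size_t spare_blocks      = nblocks - blocks_per_thread * nthread; // remainder

        size_t assigned = 0;
        for (int tnum = 0; tnum < nthread; tnum++) {
            // the last thread picks up the spare blocks
            const size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
            assigned += thr_blocks;
            printf("thread %d: %zu blocks\n", tnum, thr_blocks);
        }
        printf("total: %zu of %zu blocks assigned\n", assigned, nblocks);
        return 0;
    }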
@@ -7678,11 +8210,9 @@ static void llama_convert_tensor_internal(
  workers.clear();
  }
 
- static ggml_type get_k_quant_type(
- quantize_state_internal & qs,
- ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
- ) {
+ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
  const std::string name = ggml_get_name(tensor);
+
  // TODO: avoid hardcoded tensor names - use the TN_* constants
  const llm_arch arch = qs.model.arch;
  const auto tn = LLM_TN(arch);
@@ -7716,7 +8246,18 @@ static ggml_type get_k_quant_type(
  // nearly negligible increase in model size by quantizing this tensor with more bits:
  if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
  }
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  ++qs.i_attention_wv;
+ } else if (name.find("attn_k.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert == 8) {
+ // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+ // TODO: explore better strategies
+ new_type = GGML_TYPE_Q8_0;
+ }
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -7831,7 +8372,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  constexpr bool use_mmap = false;
  #endif
 
- llama_model_loader ml(fname_inp, use_mmap);
+ llama_model_loader ml(fname_inp, use_mmap, NULL);
  if (ml.use_mmap) {
  ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
  }
@@ -7925,10 +8466,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
  // quantize only 2D tensors
- quantize &= (tensor->n_dims == 2);
+ quantize &= (ggml_n_dims(tensor) == 2);
  quantize &= params->quantize_output_tensor || name != "output.weight";
  quantize &= !params->only_copy;
 
+ // do not quantize expert gating tensors
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
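Taken together, the quantization filter above now reads roughly as follows; a condensed standalone restatement (n_dims stands in for ggml_n_dims(tensor), and the length guard on name is added here only to make the sketch safe on short strings):

    #include <cstdio>
    #include <string>

    static bool should_quantize(const std::string & name, int n_dims,
                                bool quantize_output_tensor, bool only_copy) {
        bool quantize = name.size() >= 6 && name.rfind("weight") == name.size() - 6; // ends with 'weight'?
        quantize &= (n_dims == 2);                                                   // 2D tensors only
        quantize &= quantize_output_tensor || name != "output.weight";
        quantize &= !only_copy;
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;           // skip expert gating tensors
        return quantize;
    }

    int main() {
        printf("%d\n", should_quantize("blk.0.ffn_gate_inp.weight", 2, true, false)); // 0: kept unquantized
        printf("%d\n", should_quantize("blk.0.ffn_down.weight",     2, true, false)); // 1: quantized
        return 0;
    }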
@@ -8127,7 +8671,7 @@ static int llama_apply_lora_from_file_internal(
  std::vector<uint8_t> base_buf;
  if (path_base_model) {
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
 
  size_t ctx_size;
  size_t mmapped_size;
@@ -8355,6 +8899,7 @@ struct llama_model_params llama_model_default_params() {
  /*.tensor_split =*/ nullptr,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
+ /*.kv_overrides =*/ nullptr,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
@@ -8382,10 +8927,12 @@ struct llama_context_params llama_context_default_params() {
  /*.yarn_beta_fast =*/ 32.0f,
  /*.yarn_beta_slow =*/ 1.0f,
  /*.yarn_orig_ctx =*/ 0,
+ /*.type_k =*/ GGML_TYPE_F16,
+ /*.type_v =*/ GGML_TYPE_F16,
  /*.mul_mat_q =*/ true,
- /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.embedding =*/ false,
+ /*.offload_kqv =*/ true,
  };
 
  return result;
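For callers of the C API this is the visible change in llama_context_params: the f16_kv flag is gone and the K/V cache types are selected directly, alongside the new offload_kqv switch. A hedged usage sketch, assuming the llama.h that ships with this version:

    #include "llama.h"

    int main() {
        llama_context_params cparams = llama_context_default_params();

        cparams.type_k      = GGML_TYPE_F16;  // replaces the removed f16_kv = true
        cparams.type_v      = GGML_TYPE_F16;
        cparams.offload_kqv = true;           // keep the KV cache and KQV ops on the GPU

        // model loading and llama_new_context_with_model(model, cparams) proceed as before
        (void) cparams;
        return 0;
    }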
@@ -8502,6 +9049,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.yarn_beta_fast = params.yarn_beta_fast;
  cparams.yarn_beta_slow = params.yarn_beta_slow;
  cparams.mul_mat_q = params.mul_mat_q;
+ cparams.offload_kqv = params.offload_kqv;
 
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8535,19 +9083,36 @@ struct llama_context * llama_new_context_with_model(
  ctx->rng = std::mt19937(params.seed);
  ctx->logits_all = params.logits_all;
 
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+ const ggml_type type_k = params.type_k;
+ const ggml_type type_v = params.type_v;
+
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
 
  // reserve memory for context buffers
  if (!hparams.vocab_only) {
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }
 
  {
- const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
+ size_t memory_size_k = 0;
+ size_t memory_size_v = 0;
+
+ for (auto & k : ctx->kv_self.k_l) {
+ memory_size_k += ggml_nbytes(k);
+ }
+
+ for (auto & v : ctx->kv_self.v_l) {
+ memory_size_v += ggml_nbytes(v);
+ }
+
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
  }
 
  // resized during inference
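The new log line reports K and V separately because they can now have different types. As a back-of-the-envelope check, for a hypothetical 7B-style model (32 layers, n_embd_gqa = 4096) with a 4096-token context and an f16 cache, the per-side totals come out to 1024 MiB each:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_layer    = 32;
        const size_t n_ctx      = 4096;
        const size_t n_embd_gqa = 4096;
        const size_t elt_k      = 2;   // f16 bytes per element
        const size_t elt_v      = 2;   // f16 bytes per element

        // each layer caches n_ctx rows of n_embd_gqa elements for K and for V
        const size_t memory_size_k = n_layer * n_ctx * n_embd_gqa * elt_k;
        const size_t memory_size_v = n_layer * n_ctx * n_embd_gqa * elt_v;

        printf("KV self size = %7.2f MiB, K: %7.2f MiB, V: %7.2f MiB\n",
               (memory_size_k + memory_size_v) / (1024.0 * 1024.0),
               memory_size_k / (1024.0 * 1024.0),
               memory_size_v / (1024.0 * 1024.0));
        return 0;
    }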
@@ -8618,8 +9183,12 @@ struct llama_context * llama_new_context_with_model(
  }
 
  size_t kv_vram_size = 0;
- add_tensor(ctx->kv_self.k, kv_vram_size);
- add_tensor(ctx->kv_self.v, kv_vram_size);
+ for (auto & k : ctx->kv_self.k_l) {
+ add_tensor(k, kv_vram_size);
+ }
+ for (auto & v : ctx->kv_self.v_l) {
+ add_tensor(v, kv_vram_size);
+ }
 
  size_t ctx_vram_size = alloc_size + kv_vram_size;
  size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9089,37 +9658,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  data_ctx->write(&kv_used, sizeof(kv_used));
 
  if (kv_buf_size) {
- const size_t elt_size = ggml_element_size(kv_self.k);
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
- std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
- kout3d->data = kout3d_data.data();
+ std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
+ std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
 
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
- std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
- vout3d->data = vout3d_data.data();
+ for (int il = 0; il < (int) n_layer; ++il) {
+ ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+ kout2d_data[il].resize(ggml_nbytes(kout2d));
+ kout2d->data = kout2d_data[il].data();
 
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
- n_embd, kv_head, n_layer,
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+ ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ vout2d_data[il].resize(ggml_nbytes(vout2d));
+ vout2d->data = vout2d_data[il].data();
 
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
- kv_head, n_embd, n_layer,
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+ n_embd, kv_head,
+ elt_size*n_embd, 0);
+
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+ kv_head, n_embd,
+ elt_size*n_ctx, 0);
+
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+ }
 
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
  ggml_free(cpy_ctx);
 
- // our data is now in the kout3d_data and vout3d_data buffers
+ // our data is now in the kout2d_data and vout2d_data buffers
  // write them to file
- data_ctx->write(kout3d_data.data(), kout3d_data.size());
- data_ctx->write(vout3d_data.data(), vout3d_data.size());
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
+ data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+ }
  }
 
  for (uint32_t i = 0; i < kv_size; ++i) {
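One consequence of the hunk above worth noting: the serialized session data is now laid out per layer (layer 0's K block, then its V block, then layer 1, ...), instead of one 3-D K blob followed by one 3-D V blob. A toy sketch of the resulting byte offsets, with made-up sizes:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_layer = 4, n_embd = 8, kv_head = 3, elt = 2; // toy sizes, f16-like cache
        const size_t block   = n_embd * kv_head * elt;              // bytes per K (or V) block

        size_t offset = 0;
        for (size_t il = 0; il < n_layer; ++il) {
            printf("layer %zu: K at byte %zu, V at byte %zu\n", il, offset, offset + block);
            offset += 2 * block;
        }
        printf("total KV payload: %zu bytes\n", offset);
        return 0;
    }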
@@ -9219,29 +9796,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  if (kv_buf_size) {
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
- const size_t elt_size = ggml_element_size(kv_self.k);
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
- kin3d->data = (void *) inp;
- inp += ggml_nbytes(kin3d);
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+ kin2d->data = (void *) inp;
+ inp += ggml_nbytes(kin2d);
+
+ ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ vin2d->data = (void *) inp;
+ inp += ggml_nbytes(vin2d);
 
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
- vin3d->data = (void *) inp;
- inp += ggml_nbytes(vin3d);
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+ n_embd, kv_head,
+ elt_size*n_embd, 0);
 
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
- n_embd, kv_head, n_layer,
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+ kv_head, n_embd,
+ elt_size*n_ctx, 0);
 
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
- kv_head, n_embd, n_layer,
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+ }
 
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
  ggml_free(cpy_ctx);