llama_cpp 0.9.5 → 0.10.0 — diff of the bundled llama.cpp sources

@@ -74,6 +74,7 @@
74
74
  #include <set>
75
75
  #include <sstream>
76
76
  #include <thread>
77
+ #include <type_traits>
77
78
  #include <unordered_map>
78
79
 
79
80
  #if defined(_MSC_VER)
@@ -192,6 +193,7 @@ enum llm_arch {
192
193
  LLM_ARCH_REFACT,
193
194
  LLM_ARCH_BLOOM,
194
195
  LLM_ARCH_STABLELM,
196
+ LLM_ARCH_QWEN,
195
197
  LLM_ARCH_UNKNOWN,
196
198
  };
197
199
 
@@ -208,6 +210,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
208
210
  { LLM_ARCH_REFACT, "refact" },
209
211
  { LLM_ARCH_BLOOM, "bloom" },
210
212
  { LLM_ARCH_STABLELM, "stablelm" },
213
+ { LLM_ARCH_QWEN, "qwen" },
211
214
  };
212
215
 
213
216
  enum llm_kv {
@@ -518,6 +521,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
518
521
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
519
522
  },
520
523
  },
524
+ {
525
+ LLM_ARCH_QWEN,
526
+ {
527
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
528
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
529
+ { LLM_TENSOR_OUTPUT, "output" },
530
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
531
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
532
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
533
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
534
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
535
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
536
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
537
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
538
+ },
539
+ },
521
540
 
522
541
  {
523
542
  LLM_ARCH_UNKNOWN,
@@ -572,21 +591,6 @@ struct LLM_TN {
572
591
  // gguf helpers
573
592
  //
574
593
 
575
- #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
576
- do { \
577
- const std::string skey(key); \
578
- const int kid = gguf_find_key(ctx, skey.c_str()); \
579
- if (kid >= 0) { \
580
- enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
581
- if (ktype != (type)) { \
582
- throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
583
- } \
584
- (dst) = func(ctx, kid); \
585
- } else if (req) { \
586
- throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
587
- } \
588
- } while (0)
589
-
590
594
  static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
591
595
  { LLAMA_ROPE_SCALING_NONE, "none" },
592
596
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -620,7 +624,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
620
624
  }
621
625
  }
622
626
 
623
- static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
627
+ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
624
628
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
625
629
 
626
630
  switch (type) {
@@ -1222,6 +1226,7 @@ struct llama_cparams {
1222
1226
  float yarn_beta_slow;
1223
1227
 
1224
1228
  bool mul_mat_q;
1229
+ bool offload_kqv;
1225
1230
  };
1226
1231
 
1227
1232
  struct llama_layer {
@@ -1243,6 +1248,9 @@ struct llama_layer {
1243
1248
  struct ggml_tensor * wqkv;
1244
1249
 
1245
1250
  // attention bias
1251
+ struct ggml_tensor * bq;
1252
+ struct ggml_tensor * bk;
1253
+ struct ggml_tensor * bv;
1246
1254
  struct ggml_tensor * bo;
1247
1255
  struct ggml_tensor * bqkv;
1248
1256
 
@@ -1287,8 +1295,8 @@ struct llama_kv_cache {
1287
1295
 
1288
1296
  std::vector<llama_kv_cell> cells;
1289
1297
 
1290
- struct ggml_tensor * k = NULL;
1291
- struct ggml_tensor * v = NULL;
1298
+ std::vector<struct ggml_tensor *> k_l; // per layer
1299
+ std::vector<struct ggml_tensor *> v_l;
1292
1300
 
1293
1301
  struct ggml_context * ctx = NULL;
1294
1302
 
@@ -1301,8 +1309,10 @@ struct llama_kv_cache {
1301
1309
 
1302
1310
  #ifdef GGML_USE_CUBLAS
1303
1311
  if (ggml_cublas_loaded()) {
1304
- ggml_cuda_free_data(k);
1305
- ggml_cuda_free_data(v);
1312
+ for (size_t i = 0; i < k_l.size(); ++i) {
1313
+ ggml_cuda_free_data(k_l[i]);
1314
+ ggml_cuda_free_data(v_l[i]);
1315
+ }
1306
1316
  }
1307
1317
  #endif
1308
1318
  }
@@ -1492,9 +1502,11 @@ struct llama_context {
1492
1502
  static bool llama_kv_cache_init(
1493
1503
  const struct llama_hparams & hparams,
1494
1504
  struct llama_kv_cache & cache,
1495
- ggml_type wtype,
1505
+ ggml_type ktype,
1506
+ ggml_type vtype,
1496
1507
  uint32_t n_ctx,
1497
- int n_gpu_layers) {
1508
+ int n_gpu_layers,
1509
+ bool offload) {
1498
1510
  const uint32_t n_embd = hparams.n_embd_gqa();
1499
1511
  const uint32_t n_layer = hparams.n_layer;
1500
1512
 
@@ -1510,7 +1522,7 @@ static bool llama_kv_cache_init(
1510
1522
  cache.cells.clear();
1511
1523
  cache.cells.resize(n_ctx);
1512
1524
 
1513
- cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1525
+ cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
1514
1526
  memset(cache.buf.data, 0, cache.buf.size);
1515
1527
 
1516
1528
  struct ggml_init_params params;
@@ -1520,37 +1532,44 @@ static bool llama_kv_cache_init(
1520
1532
 
1521
1533
  cache.ctx = ggml_init(params);
1522
1534
 
1535
+ size_t vram_kv_cache = 0;
1536
+
1523
1537
  if (!cache.ctx) {
1524
1538
  LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
1525
1539
  return false;
1526
1540
  }
1527
1541
 
1528
- cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
1529
- cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
1530
- ggml_set_name(cache.k, "cache_k");
1531
- ggml_set_name(cache.v, "cache_v");
1542
+ cache.k_l.reserve(n_layer);
1543
+ cache.v_l.reserve(n_layer);
1532
1544
 
1533
- (void) n_gpu_layers;
1545
+ const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
1534
1546
 
1535
- #ifdef GGML_USE_CUBLAS
1536
- if (ggml_cublas_loaded()) {
1537
- size_t vram_kv_cache = 0;
1547
+ GGML_UNUSED(offload);
1538
1548
 
1539
- if (n_gpu_layers > (int)n_layer + 1) {
1540
- ggml_cuda_assign_buffers_no_scratch(cache.v);
1541
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1542
- vram_kv_cache += ggml_nbytes(cache.v);
1543
- }
1544
- if (n_gpu_layers > (int)n_layer + 2) {
1545
- ggml_cuda_assign_buffers_no_scratch(cache.k);
1546
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1547
- vram_kv_cache += ggml_nbytes(cache.k);
1548
- }
1549
- if (vram_kv_cache > 0) {
1550
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1549
+ for (int i = 0; i < (int) n_layer; i++) {
1550
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
1551
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
1552
+ ggml_format_name(k, "cache_k_l%d", i);
1553
+ ggml_format_name(v, "cache_v_l%d", i);
1554
+ cache.k_l.push_back(k);
1555
+ cache.v_l.push_back(v);
1556
+ #ifdef GGML_USE_CUBLAS
1557
+ if (i >= i_gpu_start) {
1558
+ if (offload) {
1559
+ ggml_cuda_assign_buffers_no_scratch(k);
1560
+ vram_kv_cache += ggml_nbytes(k);
1561
+ ggml_cuda_assign_buffers_no_scratch(v);
1562
+ vram_kv_cache += ggml_nbytes(v);
1563
+ }
1551
1564
  }
1565
+ #endif // GGML_USE_CUBLAS
1552
1566
  }
1553
- #endif
1567
+
1568
+ if (vram_kv_cache > 0) {
1569
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1570
+ }
1571
+
1572
+ GGML_UNUSED(n_gpu_layers);
1554
1573
 
1555
1574
  return true;
1556
1575
  }
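For a sense of scale, the new per-layer sizing formula above (n_elements times the K and V element sizes, plus per-tensor overhead) can be evaluated by hand. A rough standalone sketch with illustrative numbers — the constants are assumptions, not values taken from this diff:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative 7B-like configuration with F16 K and V (2 bytes per element).
    const uint64_t n_embd_gqa = 4096;
    const uint64_t n_ctx      = 4096;
    const uint64_t n_layer    = 32;
    const uint64_t n_elements = n_embd_gqa * n_ctx * n_layer; // same quantity the init code uses
    const double   k_size     = 2.0;   // bytes per element of the K type
    const double   v_size     = 2.0;   // bytes per element of the V type
    const uint64_t overhead   = 256;   // stand-in for ggml_tensor_overhead(), per tensor

    const double total = n_elements * (k_size + v_size) + 2.0 * n_layer * overhead;
    printf("KV cache buffer: %.2f MiB\n", total / 1024.0 / 1024.0); // ~2048 MiB for these numbers
    return 0;
}
```

The difference from 0.9.5 is only in the overhead term: one pair of tensors per layer instead of a single K and a single V tensor for the whole cache.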
@@ -1771,6 +1790,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
1771
1790
  return buf;
1772
1791
  }
1773
1792
 
1793
+ namespace GGUFMeta {
1794
+ template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
1795
+ struct GKV_Base_Type {
1796
+ static constexpr gguf_type gt = gt_;
1797
+
1798
+ static T getter(const gguf_context * ctx, const int kid) {
1799
+ return gfun(ctx, kid);
1800
+ }
1801
+ };
1802
+
1803
+ template<typename T> struct GKV_Base;
1804
+
1805
+ template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
1806
+ template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
1807
+ template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
1808
+ template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
1809
+ template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
1810
+ template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
1811
+ template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
1812
+ template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
1813
+ template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
1814
+ template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
1815
+ template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
1816
+ template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
1817
+
1818
+ template<> struct GKV_Base<std::string> {
1819
+ static constexpr gguf_type gt = GGUF_TYPE_STRING;
1820
+
1821
+ static std::string getter(const gguf_context * ctx, const int kid) {
1822
+ return gguf_get_val_str(ctx, kid);
1823
+ }
1824
+ };
1825
+
1826
+ struct ArrayInfo{
1827
+ const gguf_type gt;
1828
+ const size_t length;
1829
+ const void * data;
1830
+ };
1831
+
1832
+ template<> struct GKV_Base<ArrayInfo> {
1833
+ public:
1834
+ static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
1835
+ static ArrayInfo getter(const gguf_context *ctx, const int k) {
1836
+ return ArrayInfo {
1837
+ gguf_get_arr_type(ctx, k),
1838
+ size_t(gguf_get_arr_n(ctx, k)),
1839
+ gguf_get_arr_data(ctx, k),
1840
+ };
1841
+ }
1842
+ };
1843
+
1844
+ template<typename T>
1845
+ class GKV: public GKV_Base<T> {
1846
+ GKV() = delete;
1847
+
1848
+ public:
1849
+ static T get_kv(const gguf_context * ctx, const int k) {
1850
+ const enum gguf_type kt = gguf_get_kv_type(ctx, k);
1851
+
1852
+ if (kt != GKV::gt) {
1853
+ throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
1854
+ gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
1855
+ }
1856
+ return GKV::getter(ctx, k);
1857
+ }
1858
+
1859
+ static const char * override_type_to_str(const llama_model_kv_override_type ty) {
1860
+ switch (ty) {
1861
+ case LLAMA_KV_OVERRIDE_BOOL: return "bool";
1862
+ case LLAMA_KV_OVERRIDE_INT: return "int";
1863
+ case LLAMA_KV_OVERRIDE_FLOAT: return "float";
1864
+ }
1865
+ return "unknown";
1866
+ }
1867
+
1868
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
1869
+ if (!override) { return false; }
1870
+ if (override->tag == expected_type) {
1871
+ LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
1872
+ __func__, override_type_to_str(override->tag), override->key);
1873
+ switch (override->tag) {
1874
+ case LLAMA_KV_OVERRIDE_BOOL: {
1875
+ printf("%s\n", override->bool_value ? "true" : "false");
1876
+ } break;
1877
+ case LLAMA_KV_OVERRIDE_INT: {
1878
+ printf("%" PRId64 "\n", override->int_value);
1879
+ } break;
1880
+ case LLAMA_KV_OVERRIDE_FLOAT: {
1881
+ printf("%.6f\n", override->float_value);
1882
+ } break;
1883
+ default:
1884
+ // Shouldn't be possible to end up here, but just in case...
1885
+ throw std::runtime_error(
1886
+ format("Unsupported attempt to override %s type for metadata key %s\n",
1887
+ override_type_to_str(override->tag), override->key));
1888
+ }
1889
+ return true;
1890
+ }
1891
+ LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
1892
+ __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
1893
+ return false;
1894
+ }
1895
+
1896
+ template<typename OT>
1897
+ static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
1898
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1899
+ if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
1900
+ target = override->bool_value;
1901
+ return true;
1902
+ }
1903
+ return true;
1904
+ }
1905
+
1906
+ template<typename OT>
1907
+ static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
1908
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1909
+ if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
1910
+ target = override->int_value;
1911
+ return true;
1912
+ }
1913
+ return false;
1914
+ }
1915
+
1916
+ template<typename OT>
1917
+ static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
1918
+ try_override(T & target, const struct llama_model_kv_override *override) {
1919
+ if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
1920
+ target = override->float_value;
1921
+ return true;
1922
+ }
1923
+ return false;
1924
+ }
1925
+
1926
+ template<typename OT>
1927
+ static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
1928
+ try_override(T & target, const struct llama_model_kv_override *override) {
1929
+ (void)target;
1930
+ (void)override;
1931
+ if (!override) { return false; }
1932
+ // Currently, we should never end up here so it would be a bug if we do.
1933
+ throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
1934
+ override ? override->key : "NULL"));
1935
+ }
1936
+
1937
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
1938
+ if (try_override<T>(target, override)) {
1939
+ return true;
1940
+ }
1941
+ if (k < 0) { return false; }
1942
+ target = get_kv(ctx, k);
1943
+ return true;
1944
+ }
1945
+
1946
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
1947
+ return set(ctx, gguf_find_key(ctx, key), target, override);
1948
+ }
1949
+
1950
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
1951
+ return set(ctx, key.c_str(), target, override);
1952
+ }
1953
+ };
1954
+ }
1955
+
1774
1956
  struct llama_model_loader {
1775
1957
  int n_kv = 0;
1776
1958
  int n_tensors = 0;
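The GGUFMeta block added above replaces the removed GGUF_GET_KEY macro with a trait per C++ type: each GKV_Base specialization fixes the expected gguf_type and the getter at compile time, and GKV::get_kv throws if the stored type disagrees. A minimal standalone sketch of the same pattern, with hypothetical names and no gguf dependency:

```cpp
#include <cstdint>
#include <stdexcept>

enum class kv_tag { U32, F32 };

// One specialization per supported C++ type, mirroring GKV_Base<T>.
template <typename T> struct kv_trait;
template <> struct kv_trait<uint32_t> { static constexpr kv_tag tag = kv_tag::U32; };
template <> struct kv_trait<float>    { static constexpr kv_tag tag = kv_tag::F32; };

template <typename T>
T read_kv(kv_tag stored, const void * payload) {
    if (stored != kv_trait<T>::tag) {
        throw std::runtime_error("key has wrong type"); // same check GKV::get_kv performs
    }
    return *static_cast<const T *>(payload);
}

// usage: uint32_t n_ctx = read_kv<uint32_t>(kv_tag::U32, &stored_u32);
```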
@@ -1786,21 +1968,34 @@ struct llama_model_loader {
1786
1968
  llama_fver fver;
1787
1969
 
1788
1970
  std::unique_ptr<llama_mmap> mapping;
1971
+ std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
1789
1972
 
1790
1973
  struct gguf_context * ctx_gguf = NULL;
1791
1974
  struct ggml_context * ctx_meta = NULL;
1792
1975
 
1793
- llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
1976
+ std::string arch_name;
1977
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
1978
+
1979
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
1794
1980
  struct gguf_init_params params = {
1795
1981
  /*.no_alloc = */ true,
1796
1982
  /*.ctx = */ &ctx_meta,
1797
1983
  };
1798
1984
 
1985
+ if (param_overrides_p != nullptr) {
1986
+ for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
1987
+ kv_overrides.insert({std::string(p->key), *p});
1988
+ }
1989
+ }
1990
+
1799
1991
  ctx_gguf = gguf_init_from_file(fname.c_str(), params);
1800
1992
  if (!ctx_gguf) {
1801
1993
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
1802
1994
  }
1803
1995
 
1996
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
1997
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
1998
+
1804
1999
  n_kv = gguf_get_n_kv(ctx_gguf);
1805
2000
  n_tensors = gguf_get_n_tensors(ctx_gguf);
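Since the loader constructor now accepts an array of llama_model_kv_override entries terminated by an empty key, a caller can force individual metadata values at load time. A hedged usage sketch — the struct field names (key, tag, bool_value, …) come from this diff, while the fixed-size key buffer, the example key string, and the surrounding llama.h calls are assumptions about the public API:

```cpp
#include <cstring>
#include <vector>
#include "llama.h" // assumed to declare llama_model_kv_override and llama_model_params::kv_overrides

int main() {
    std::vector<llama_model_kv_override> overrides;

    llama_model_kv_override kvo;
    std::memset(&kvo, 0, sizeof(kvo));
    // Example key; it has to match whatever key get_key() actually queries.
    std::strncpy(kvo.key, "tokenizer.ggml.add_bos_token", sizeof(kvo.key) - 1); // assumes key is a char array
    kvo.tag        = LLAMA_KV_OVERRIDE_BOOL;
    kvo.bool_value = false;
    overrides.push_back(kvo);

    // The constructor loop stops at the first entry whose key starts with '\0'.
    llama_model_kv_override terminator;
    std::memset(&terminator, 0, sizeof(terminator));
    overrides.push_back(terminator);

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides.data(); // forwarded to llama_model_loader as param_overrides_p

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model) {
        llama_free_model(model);
    }
    return 0;
}
```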
1806
2001
 
@@ -1868,6 +2063,7 @@ struct llama_model_loader {
1868
2063
  }
1869
2064
  }
1870
2065
 
2066
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
1871
2067
  for (int i = 0; i < n_kv; i++) {
1872
2068
  const char * name = gguf_get_key(ctx_gguf, i);
1873
2069
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -1913,19 +2109,59 @@ struct llama_model_loader {
1913
2109
  }
1914
2110
  }
1915
2111
 
1916
- std::string get_arch_name() const {
1917
- const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
2112
+ template<typename T>
2113
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2114
+ get_arr_n(const std::string & key, T & result, const bool required = true) {
2115
+ const int kid = gguf_find_key(ctx_gguf, key.c_str());
2116
+
2117
+ if (kid < 0) {
2118
+ if (required) {
2119
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2120
+ }
2121
+ return false;
2122
+ }
2123
+
2124
+ struct GGUFMeta::ArrayInfo arr_info =
2125
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
1918
2126
 
1919
- std::string arch_name;
1920
- GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
1921
2127
 
2128
+ result = arr_info.length;
2129
+ return true;
2130
+ }
2131
+
2132
+ template<typename T>
2133
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2134
+ get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
2135
+ return get_arr_n(llm_kv(kid), result, required);
2136
+ }
2137
+
2138
+ template<typename T>
2139
+ bool get_key(const std::string & key, T & result, const bool required = true) {
2140
+ auto it = kv_overrides.find(key);
2141
+
2142
+ const struct llama_model_kv_override * override =
2143
+ it != kv_overrides.end() ? &it->second : nullptr;
2144
+
2145
+ const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
2146
+
2147
+ if (required && !found) {
2148
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2149
+ }
2150
+
2151
+ return found;
2152
+ }
2153
+
2154
+ template<typename T>
2155
+ bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
2156
+ return get_key(llm_kv(kid), result, required);
2157
+ }
2158
+
2159
+ std::string get_arch_name() const {
1922
2160
  return arch_name;
1923
2161
  }
1924
2162
 
1925
2163
  enum llm_arch get_arch() const {
1926
- const std::string arch_name = get_arch_name();
1927
-
1928
- return llm_arch_from_string(arch_name);
2164
+ return llm_kv.arch;
1929
2165
  }
1930
2166
 
1931
2167
  const char * get_tensor_name(int i) const {
@@ -1965,10 +2201,13 @@ struct llama_model_loader {
1965
2201
  return tensor;
1966
2202
  }
1967
2203
 
1968
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
2204
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
1969
2205
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1970
2206
 
1971
2207
  if (cur == NULL) {
2208
+ if (!required) {
2209
+ return NULL;
2210
+ }
1972
2211
  throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
1973
2212
  }
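Both get_arr_n above and the GKV::try_override overloads earlier rely on enable_if to pick an implementation from the target's type (bool vs. other integers vs. floating point). A compact standalone illustration of that dispatch, with hypothetical names:

```cpp
#include <type_traits>

template <typename T>
typename std::enable_if<std::is_same<T, bool>::value, const char *>::type
which(T &) { return "bool path"; }

template <typename T>
typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, const char *>::type
which(T &) { return "integer path"; }

template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, const char *>::type
which(T &) { return "float path"; }

// which(flag) with a bool picks the first overload; which(n_layer) with an int the second;
// which(eps) with a float or double the third — the same way a try_override overload is selected.
```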
1974
2213
 
@@ -2172,11 +2411,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2172
2411
  static void llm_load_hparams(
2173
2412
  llama_model_loader & ml,
2174
2413
  llama_model & model) {
2175
- struct gguf_context * ctx = ml.ctx_gguf;
2176
-
2177
- const auto kv = LLM_KV(model.arch);
2178
-
2179
2414
  auto & hparams = model.hparams;
2415
+ const gguf_context * ctx = ml.ctx_gguf;
2180
2416
 
2181
2417
  // get metadata as string
2182
2418
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2190,42 +2426,41 @@ static void llm_load_hparams(
2190
2426
  }
2191
2427
 
2192
2428
  // get general kv
2193
- GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
2429
+ ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
2194
2430
 
2195
2431
  // get hparams kv
2196
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
2197
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
2198
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
2199
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
2200
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
2201
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
2432
+ ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
2433
+ ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
2434
+ ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
2435
+ ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
2436
+ ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
2437
+ ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
2202
2438
 
2203
2439
  // n_head_kv is optional, default to n_head
2204
2440
  hparams.n_head_kv = hparams.n_head;
2205
- GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
2441
+ ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
2206
2442
 
2207
- hparams.rope_finetuned = false;
2208
- GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
2209
- kv(LLM_KV_ROPE_SCALING_FINETUNED));
2443
+ bool rope_finetuned = false;
2444
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2445
+ hparams.rope_finetuned = rope_finetuned;
2210
2446
 
2211
2447
  hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
2212
- GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
2213
- kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
2448
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
2214
2449
 
2215
2450
  // rope_freq_base (optional)
2216
2451
  hparams.rope_freq_base_train = 10000.0f;
2217
- GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
2452
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
2218
2453
 
2219
2454
  std::string rope_scaling("linear");
2220
- GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
2455
+ ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2221
2456
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2222
2457
  GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
2223
2458
 
2224
2459
  // rope_freq_scale (inverse of the kv) is optional
2225
2460
  float ropescale = 0.0f;
2226
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
2227
- if (ropescale == 0.0f) { // try the old key name
2228
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
2461
+ if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
2462
+ // try the old key name
2463
+ ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
2229
2464
  }
2230
2465
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
2231
2466
 
@@ -2233,7 +2468,7 @@ static void llm_load_hparams(
2233
2468
  {
2234
2469
  hparams.n_rot = hparams.n_embd / hparams.n_head;
2235
2470
 
2236
- GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
2471
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
2237
2472
 
2238
2473
  if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
2239
2474
  if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2248,7 +2483,7 @@ static void llm_load_hparams(
2248
2483
  switch (model.arch) {
2249
2484
  case LLM_ARCH_LLAMA:
2250
2485
  {
2251
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2486
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2252
2487
 
2253
2488
  switch (hparams.n_layer) {
2254
2489
  case 26: model.type = e_model::MODEL_3B; break;
@@ -2262,7 +2497,7 @@ static void llm_load_hparams(
2262
2497
  } break;
2263
2498
  case LLM_ARCH_FALCON:
2264
2499
  {
2265
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2500
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2266
2501
 
2267
2502
  switch (hparams.n_layer) {
2268
2503
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2272,7 +2507,7 @@ static void llm_load_hparams(
2272
2507
  } break;
2273
2508
  case LLM_ARCH_BAICHUAN:
2274
2509
  {
2275
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2510
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2276
2511
  switch (hparams.n_layer) {
2277
2512
  case 32: model.type = e_model::MODEL_7B; break;
2278
2513
  case 40: model.type = e_model::MODEL_13B; break;
@@ -2281,7 +2516,7 @@ static void llm_load_hparams(
2281
2516
  } break;
2282
2517
  case LLM_ARCH_STARCODER:
2283
2518
  {
2284
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2519
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2285
2520
  switch (hparams.n_layer) {
2286
2521
  case 24: model.type = e_model::MODEL_1B; break;
2287
2522
  case 36: model.type = e_model::MODEL_3B; break;
@@ -2292,7 +2527,7 @@ static void llm_load_hparams(
2292
2527
  } break;
2293
2528
  case LLM_ARCH_PERSIMMON:
2294
2529
  {
2295
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2530
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2296
2531
  switch (hparams.n_layer) {
2297
2532
  case 36: model.type = e_model::MODEL_8B; break;
2298
2533
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2300,7 +2535,7 @@ static void llm_load_hparams(
2300
2535
  } break;
2301
2536
  case LLM_ARCH_REFACT:
2302
2537
  {
2303
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2538
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2304
2539
  switch (hparams.n_layer) {
2305
2540
  case 32: model.type = e_model::MODEL_1B; break;
2306
2541
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2308,7 +2543,7 @@ static void llm_load_hparams(
2308
2543
  } break;
2309
2544
  case LLM_ARCH_BLOOM:
2310
2545
  {
2311
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2546
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2312
2547
 
2313
2548
  switch (hparams.n_layer) {
2314
2549
  case 24: model.type = e_model::MODEL_1B; break;
@@ -2323,9 +2558,9 @@ static void llm_load_hparams(
2323
2558
  {
2324
2559
  hparams.f_clamp_kqv = 0.0f;
2325
2560
 
2326
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2327
- GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2328
- GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2561
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2562
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
2563
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
2329
2564
 
2330
2565
  switch (hparams.n_layer) {
2331
2566
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2335,13 +2570,23 @@ static void llm_load_hparams(
2335
2570
  } break;
2336
2571
  case LLM_ARCH_STABLELM:
2337
2572
  {
2338
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2573
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2339
2574
 
2340
2575
  switch (hparams.n_layer) {
2341
2576
  case 32: model.type = e_model::MODEL_3B; break;
2342
2577
  default: model.type = e_model::MODEL_UNKNOWN;
2343
2578
  }
2344
2579
  } break;
2580
+ case LLM_ARCH_QWEN:
2581
+ {
2582
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2583
+
2584
+ switch (hparams.n_layer) {
2585
+ case 32: model.type = e_model::MODEL_7B; break;
2586
+ case 40: model.type = e_model::MODEL_13B; break;
2587
+ default: model.type = e_model::MODEL_UNKNOWN;
2588
+ }
2589
+ } break;
2345
2590
 
2346
2591
  default: (void)0;
2347
2592
  }
@@ -2383,7 +2628,7 @@ static void llm_load_vocab(
2383
2628
  {
2384
2629
  std::string tokenizer_name;
2385
2630
 
2386
- GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
2631
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
2387
2632
 
2388
2633
  if (tokenizer_name == "llama") {
2389
2634
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2473,34 +2718,31 @@ static void llm_load_vocab(
2473
2718
  };
2474
2719
  for (const auto & it : special_token_types) {
2475
2720
  const std::string & key = kv(std::get<0>(it));
2476
- int32_t & id = std::get<1>(it), old_id = id;
2721
+ int32_t & id = std::get<1>(it);
2477
2722
 
2478
- GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
2479
- // Must be >= -1 and < vocab size. Since the key is unsigned, -1
2480
- // can only come from the default value, so there's no point in
2481
- // validating that.
2482
- if (size_t(id + 1) > vocab.id_to_token.size()) {
2483
- LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
2484
- __func__, key.c_str(), id, old_id);
2485
- id = old_id;
2723
+ uint32_t new_id;
2724
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
2725
+ continue;
2726
+ }
2727
+ if (new_id >= vocab.id_to_token.size()) {
2728
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
2729
+ __func__, key.c_str(), new_id, id);
2730
+ } else {
2731
+ id = new_id;
2486
2732
  }
2487
2733
 
2488
2734
  }
2489
2735
 
2490
2736
  // Handle add_bos_token and add_eos_token
2491
- std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
2492
- int kid = gguf_find_key(ctx, key.c_str());
2493
- enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2494
- vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2495
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2496
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2497
- }
2498
- key = kv(LLM_KV_TOKENIZER_ADD_EOS);
2499
- kid = gguf_find_key(ctx, key.c_str());
2500
- ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2501
- vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2502
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2503
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2737
+ {
2738
+ bool temp = true;
2739
+
2740
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2741
+ vocab.special_add_bos = int(temp);
2742
+ }
2743
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2744
+ vocab.special_add_eos = int(temp);
2745
+ }
2504
2746
  }
2505
2747
  }
2506
2748
 
@@ -2733,14 +2975,7 @@ static void llm_load_tensors(
2733
2975
  ggml_backend_type backend_output;
2734
2976
 
2735
2977
  if (n_gpu_layers > int(n_layer)) {
2736
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2737
- // on Windows however this is detrimental unless everything is on the GPU
2738
- #ifndef _WIN32
2739
- backend_norm = llama_backend_offload;
2740
- #else
2741
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2742
- #endif // _WIN32
2743
-
2978
+ backend_norm = llama_backend_offload;
2744
2979
  backend_output = llama_backend_offload_split;
2745
2980
  } else {
2746
2981
  backend_norm = GGML_BACKEND_CPU;
@@ -2777,6 +3012,12 @@ static void llm_load_tensors(
2777
3012
  layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2778
3013
  layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2779
3014
 
3015
+ // optional bias tensors
3016
+ layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
3017
+ layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
3018
+ layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
3019
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
3020
+
2780
3021
  layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2781
3022
 
2782
3023
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
@@ -2785,9 +3026,14 @@ static void llm_load_tensors(
2785
3026
 
2786
3027
  if (backend == GGML_BACKEND_GPU) {
2787
3028
  vram_weights +=
2788
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2789
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2790
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3029
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3030
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
3031
+ (layer.bq ? ggml_nbytes(layer.bq) : 0) +
3032
+ (layer.bk ? ggml_nbytes(layer.bk) : 0) +
3033
+ (layer.bv ? ggml_nbytes(layer.bv) : 0) +
3034
+ (layer.bo ? ggml_nbytes(layer.bo) : 0) +
3035
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3036
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
2791
3037
  }
2792
3038
  }
2793
3039
  } break;
@@ -2799,14 +3045,7 @@ static void llm_load_tensors(
2799
3045
  ggml_backend_type backend_output;
2800
3046
 
2801
3047
  if (n_gpu_layers > int(n_layer)) {
2802
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2803
- // on Windows however this is detrimental unless everything is on the GPU
2804
- #ifndef _WIN32
2805
- backend_norm = llama_backend_offload;
2806
- #else
2807
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2808
- #endif // _WIN32
2809
-
3048
+ backend_norm = llama_backend_offload;
2810
3049
  backend_output = llama_backend_offload_split;
2811
3050
  } else {
2812
3051
  backend_norm = GGML_BACKEND_CPU;
@@ -2869,14 +3108,7 @@ static void llm_load_tensors(
2869
3108
  ggml_backend_type backend_output;
2870
3109
 
2871
3110
  if (n_gpu_layers > int(n_layer)) {
2872
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2873
- // on Windows however this is detrimental unless everything is on the GPU
2874
- #ifndef _WIN32
2875
- backend_norm = llama_backend_offload;
2876
- #else
2877
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2878
- #endif // _WIN32
2879
-
3111
+ backend_norm = llama_backend_offload;
2880
3112
  backend_output = llama_backend_offload_split;
2881
3113
  } else {
2882
3114
  backend_norm = GGML_BACKEND_CPU;
@@ -2946,14 +3178,7 @@ static void llm_load_tensors(
2946
3178
  ggml_backend_type backend_output;
2947
3179
 
2948
3180
  if (n_gpu_layers > int(n_layer)) {
2949
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2950
- // on Windows however this is detrimental unless everything is on the GPU
2951
- #ifndef _WIN32
2952
- backend_norm = llama_backend_offload;
2953
- #else
2954
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2955
- #endif // _WIN32
2956
-
3181
+ backend_norm = llama_backend_offload;
2957
3182
  backend_output = llama_backend_offload_split;
2958
3183
  } else {
2959
3184
  backend_norm = GGML_BACKEND_CPU;
@@ -3023,21 +3248,7 @@ static void llm_load_tensors(
3023
3248
  ggml_backend_type backend_output;
3024
3249
 
3025
3250
  if (n_gpu_layers > int(n_layer)) {
3026
- #ifdef GGML_USE_CUBLAS
3027
- if (n_gpu_layers > int(n_layer + 1)) {
3028
- LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
3029
- __func__, n_layer + 1);
3030
- throw std::runtime_error("Persimmon CUDA offload failed");
3031
- }
3032
- #endif
3033
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3034
- // on Windows however this is detrimental unless everything is on the GPU
3035
- #ifndef _WIN32
3036
- backend_norm = llama_backend_offload;
3037
- #else
3038
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3039
- #endif // _WIN32
3040
-
3251
+ backend_norm = llama_backend_offload;
3041
3252
  backend_output = llama_backend_offload_split;
3042
3253
  } else {
3043
3254
  backend_norm = GGML_BACKEND_CPU;
@@ -3096,14 +3307,7 @@ static void llm_load_tensors(
3096
3307
  ggml_backend_type backend_output;
3097
3308
 
3098
3309
  if (n_gpu_layers > int(n_layer)) {
3099
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3100
- // on Windows however this is detrimental unless everything is on the GPU
3101
- #ifndef _WIN32
3102
- backend_norm = llama_backend_offload;
3103
- #else
3104
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3105
- #endif // _WIN32
3106
-
3310
+ backend_norm = llama_backend_offload;
3107
3311
  backend_output = llama_backend_offload_split;
3108
3312
  } else {
3109
3313
  backend_norm = GGML_BACKEND_CPU;
@@ -3174,14 +3378,7 @@ static void llm_load_tensors(
3174
3378
  ggml_backend_type backend_output;
3175
3379
 
3176
3380
  if (n_gpu_layers > int(n_layer)) {
3177
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3178
- // on Windows however this is detrimental unless everything is on the GPU
3179
- #ifndef _WIN32
3180
- backend_norm = llama_backend_offload;
3181
- #else
3182
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3183
- #endif // _WIN32
3184
-
3381
+ backend_norm = llama_backend_offload;
3185
3382
  backend_output = llama_backend_offload_split;
3186
3383
  } else {
3187
3384
  backend_norm = GGML_BACKEND_CPU;
@@ -3241,14 +3438,7 @@ static void llm_load_tensors(
3241
3438
  ggml_backend_type backend_output;
3242
3439
 
3243
3440
  if (n_gpu_layers > int(n_layer)) {
3244
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3245
- // on Windows however this is detrimental unless everything is on the GPU
3246
- #ifndef _WIN32
3247
- backend_norm = llama_backend_offload;
3248
- #else
3249
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3250
- #endif // _WIN32
3251
-
3441
+ backend_norm = llama_backend_offload;
3252
3442
  backend_output = llama_backend_offload_split;
3253
3443
  } else {
3254
3444
  backend_norm = GGML_BACKEND_CPU;
@@ -3305,6 +3495,64 @@ static void llm_load_tensors(
3305
3495
  }
3306
3496
  }
3307
3497
  } break;
3498
+ case LLM_ARCH_QWEN:
3499
+ {
3500
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3501
+ {
3502
+ ggml_backend_type backend_norm;
3503
+ ggml_backend_type backend_output;
3504
+
3505
+ if (n_gpu_layers > int(n_layer)) {
3506
+ backend_norm = llama_backend_offload;
3507
+ backend_output = llama_backend_offload_split;
3508
+ } else {
3509
+ backend_norm = GGML_BACKEND_CPU;
3510
+ backend_output = GGML_BACKEND_CPU;
3511
+ }
3512
+
3513
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3514
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3515
+
3516
+ if (backend_norm == GGML_BACKEND_GPU) {
3517
+ vram_weights += ggml_nbytes(model.output_norm);
3518
+ }
3519
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3520
+ vram_weights += ggml_nbytes(model.output);
3521
+ }
3522
+ }
3523
+
3524
+ const uint32_t n_ff = hparams.n_ff / 2;
3525
+
3526
+ const int i_gpu_start = n_layer - n_gpu_layers;
3527
+
3528
+ model.layers.resize(n_layer);
3529
+
3530
+ for (uint32_t i = 0; i < n_layer; ++i) {
3531
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3532
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3533
+
3534
+ auto & layer = model.layers[i];
3535
+
3536
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3537
+
3538
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
3539
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
3540
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3541
+
3542
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3543
+
3544
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3545
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3546
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3547
+
3548
+ if (backend == GGML_BACKEND_GPU) {
3549
+ vram_weights +=
3550
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3551
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3552
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3553
+ }
3554
+ }
3555
+ } break;
3308
3556
 
3309
3557
  default:
3310
3558
  throw std::runtime_error("unknown architecture");
@@ -3331,8 +3579,8 @@ static void llm_load_tensors(
3331
3579
  }
3332
3580
 
3333
3581
  #ifdef GGML_USE_CUBLAS
3334
- const int max_backend_supported_layers = hparams.n_layer + 3;
3335
- const int max_offloadable_layers = hparams.n_layer + 3;
3582
+ const int max_backend_supported_layers = hparams.n_layer + 1;
3583
+ const int max_offloadable_layers = hparams.n_layer + 1;
3336
3584
  #elif GGML_USE_CLBLAST
3337
3585
  const int max_backend_supported_layers = hparams.n_layer + 1;
3338
3586
  const int max_offloadable_layers = hparams.n_layer + 1;
@@ -3373,7 +3621,7 @@ static void llm_load_tensors(
3373
3621
 
3374
3622
  static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
3375
3623
  try {
3376
- llama_model_loader ml(fname, params.use_mmap);
3624
+ llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
3377
3625
 
3378
3626
  model.hparams.vocab_only = params.vocab_only;
3379
3627
 
@@ -3500,11 +3748,11 @@ static void llm_build_k_shift(
3500
3748
  struct ggml_tensor * tmp =
3501
3749
  // we rotate only the first n_rot dimensions
3502
3750
  ggml_rope_custom_inplace(ctx,
3503
- ggml_view_3d(ctx, kv.k,
3751
+ ggml_view_3d(ctx, kv.k_l[il],
3504
3752
  n_embd_head, n_head_kv, n_ctx,
3505
- ggml_element_size(kv.k)*n_embd_head,
3506
- ggml_element_size(kv.k)*n_embd_gqa,
3507
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
3753
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
3754
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
3755
+ 0),
3508
3756
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
3509
3757
  ext_factor, attn_factor, beta_fast, beta_slow);
3510
3758
  cb(tmp, "K_shifted", il);
@@ -3531,13 +3779,13 @@ static void llm_build_kv_store(
3531
3779
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
3532
3780
  cb(v_cur_t, "v_cur_t", il);
3533
3781
 
3534
- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa,
3535
- (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3782
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
3783
+ (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
3536
3784
  cb(k_cache_view, "k_cache_view", il);
3537
3785
 
3538
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa,
3539
- ( n_ctx)*ggml_element_size(kv.v),
3540
- (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v));
3786
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
3787
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
3788
+ (kv_head)*ggml_element_size(kv.v_l[il]));
3541
3789
  cb(v_cache_view, "v_cache_view", il);
3542
3790
 
3543
3791
  // important: storing RoPE-ed version of K in the KV cache!
@@ -3689,11 +3937,11 @@ static struct ggml_tensor * llm_build_kqv(
3689
3937
  cb(q, "q", il);
3690
3938
 
3691
3939
  struct ggml_tensor * k =
3692
- ggml_view_3d(ctx, kv.k,
3940
+ ggml_view_3d(ctx, kv.k_l[il],
3693
3941
  n_embd_head, n_kv, n_head_kv,
3694
- ggml_element_size(kv.k)*n_embd_gqa,
3695
- ggml_element_size(kv.k)*n_embd_head,
3696
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il);
3942
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
3943
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
3944
+ 0);
3697
3945
  cb(k, "k", il);
3698
3946
 
3699
3947
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -3724,11 +3972,11 @@ static struct ggml_tensor * llm_build_kqv(
3724
3972
 
3725
3973
  // split cached v into n_head heads
3726
3974
  struct ggml_tensor * v =
3727
- ggml_view_3d(ctx, kv.v,
3975
+ ggml_view_3d(ctx, kv.v_l[il],
3728
3976
  n_kv, n_embd_head, n_head_kv,
3729
- ggml_element_size(kv.v)*n_ctx,
3730
- ggml_element_size(kv.v)*n_ctx*n_embd_head,
3731
- ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il);
3977
+ ggml_element_size(kv.v_l[il])*n_ctx,
3978
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
3979
+ 0);
3732
3980
  cb(v, "v", il);
3733
3981
 
3734
3982
  struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
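The view changes in llm_build_kv_store and llm_build_kqv boil down to dropping the per-layer component of the byte offset: the layer index used to select a slice inside one big cache tensor, whereas kv.k_l[il] and kv.v_l[il] already are that slice. A small arithmetic sketch with illustrative values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative values; F16 K cache, so 2 bytes per element.
    const uint64_t elt        = 2;
    const uint64_t n_embd_gqa = 4096;
    const uint64_t n_ctx      = 4096;
    const uint64_t il         = 10; // layer index
    const uint64_t kv_head    = 57; // first free cell in the cache

    // 0.9.5 layout: one tensor for all layers, so the view skips il full layers first.
    const uint64_t off_old = elt * n_embd_gqa * (il * n_ctx + kv_head);
    // 0.10.0 layout: kv.k_l[il] is already this layer's tensor, only the head offset remains.
    const uint64_t off_new = elt * n_embd_gqa * kv_head;

    printf("old offset: %llu bytes, new offset: %llu bytes\n",
           (unsigned long long) off_old, (unsigned long long) off_new);
    return 0;
}
```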
@@ -3886,12 +4134,24 @@ struct llm_build_context {
3886
4134
  // compute Q and K and RoPE them
3887
4135
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3888
4136
  cb(Qcur, "Qcur", il);
4137
+ if (model.layers[il].bq) {
4138
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
4139
+ cb(Qcur, "Qcur", il);
4140
+ }
3889
4141
 
3890
4142
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3891
4143
  cb(Kcur, "Kcur", il);
4144
+ if (model.layers[il].bk) {
4145
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
4146
+ cb(Kcur, "Kcur", il);
4147
+ }
3892
4148
 
3893
4149
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3894
4150
  cb(Vcur, "Vcur", il);
4151
+ if (model.layers[il].bv) {
4152
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
4153
+ cb(Vcur, "Vcur", il);
4154
+ }
3895
4155
 
3896
4156
  Qcur = ggml_rope_custom(
3897
4157
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3910,7 +4170,7 @@ struct llm_build_context {
3910
4170
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3911
4171
 
3912
4172
  cur = llm_build_kqv(ctx0, hparams, kv_self,
3913
- model.layers[il].wo, NULL,
4173
+ model.layers[il].wo, model.layers[il].bo,
3914
4174
  Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
3915
4175
  cb(cur, "kqv_out", il);
3916
4176
  }
@@ -4308,6 +4568,7 @@ struct llm_build_context {
4308
4568
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4309
4569
  cb(inpL, "imp_embd", -1);
4310
4570
 
4571
+ // inp_pos - contains the positions
4311
4572
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4312
4573
  cb(inp_pos, "inp_pos", -1);
4313
4574
 
@@ -4315,6 +4576,7 @@ struct llm_build_context {
4315
4576
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4316
4577
  cb(KQ_scale, "KQ_scale", -1);
4317
4578
 
4579
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4318
4580
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4319
4581
  cb(KQ_mask, "KQ_mask", -1);
4320
4582
 
@@ -4903,6 +5165,121 @@ struct llm_build_context {
4903
5165
 
4904
5166
  return gf;
4905
5167
  }
5168
+
5169
+ struct ggml_cgraph * build_qwen() {
5170
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5171
+
5172
+ struct ggml_tensor * cur;
5173
+ struct ggml_tensor * inpL;
5174
+
5175
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5176
+ cb(inpL, "inp_embd", -1);
5177
+
5178
+ // inp_pos - contains the positions
5179
+ struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5180
+ cb(inp_pos, "inp_pos", -1);
5181
+
5182
+ // KQ_scale
5183
+ struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5184
+ cb(KQ_scale, "KQ_scale", -1);
5185
+
5186
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5187
+ struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5188
+ cb(KQ_mask, "KQ_mask", -1);
5189
+
5190
+ // shift the entire K-cache if needed
5191
+ if (do_rope_shift) {
5192
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5193
+ }
5194
+
5195
+ for (int il = 0; il < n_layer; ++il) {
5196
+ struct ggml_tensor * inpSA = inpL;
5197
+
5198
+ cur = llm_build_norm(ctx0, inpL, hparams,
5199
+ model.layers[il].attn_norm, NULL,
5200
+ LLM_NORM_RMS, cb, il);
5201
+ cb(cur, "attn_norm", il);
5202
+
5203
+ // self-attention
5204
+ {
5205
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5206
+ cb(cur, "wqkv", il);
5207
+
5208
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5209
+ cb(cur, "bqkv", il);
5210
+
5211
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5212
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5213
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
5214
+
5215
+ cb(Qcur, "Qcur", il);
5216
+ cb(Kcur, "Kcur", il);
5217
+ cb(Vcur, "Vcur", il);
5218
+
5219
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5220
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5221
+
5222
+ // using mode = 2 for neox mode
5223
+ Qcur = ggml_rope_custom(
5224
+ ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5225
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5226
+ );
5227
+ cb(Qcur, "Qcur", il);
5228
+
5229
+ Kcur = ggml_rope_custom(
5230
+ ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5231
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5232
+ );
5233
+ cb(Kcur, "Kcur", il);
5234
+
5235
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5236
+
5237
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
5238
+ model.layers[il].wo, NULL,
5239
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5240
+ cb(cur, "kqv_out", il);
5241
+ }
5242
+
5243
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5244
+ cb(ffn_inp, "ffn_inp", il);
5245
+
5246
+ // feed-forward network
5247
+ {
5248
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
5249
+ model.layers[il].ffn_norm, NULL,
5250
+ LLM_NORM_RMS, cb, il);
5251
+ cb(cur, "ffn_norm", il);
5252
+
5253
+ cur = llm_build_ffn(ctx0, cur,
5254
+ model.layers[il].ffn_up, NULL,
5255
+ model.layers[il].ffn_gate, NULL,
5256
+ model.layers[il].ffn_down, NULL,
5257
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5258
+ cb(cur, "ffn_out", il);
5259
+ }
5260
+
5261
+ cur = ggml_add(ctx0, cur, ffn_inp);
5262
+ cb(cur, "l_out", il);
5263
+
5264
+ // input for next layer
5265
+ inpL = cur;
5266
+ }
5267
+
5268
+ cur = inpL;
5269
+
5270
+ cur = llm_build_norm(ctx0, cur, hparams,
5271
+ model.output_norm, NULL,
5272
+ LLM_NORM_RMS, cb, -1);
5273
+ cb(cur, "result_norm", -1);
5274
+
5275
+ // lm_head
5276
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5277
+ cb(cur, "result_output", -1);
5278
+
5279
+ ggml_build_forward_expand(gf, cur);
5280
+
5281
+ return gf;
5282
+ }
4906
5283
  };
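build_qwen above runs a single fused wqkv projection, adds bqkv, and then carves Q, K and V out of the result with ggml_view_2d at offsets of 0, n_embd and 2*n_embd floats. A plain-C++ analogue of that split (hypothetical helper, not part of the library):

```cpp
#include <cstddef>
#include <vector>

// Views into a fused QKV activation of shape [3*n_embd, n_tokens],
// laid out token by token like the ggml tensor in build_qwen.
struct qkv_views {
    const float * q;
    const float * k;
    const float * v;
    size_t        stride; // floats between consecutive tokens (3*n_embd)
};

qkv_views split_qkv(const std::vector<float> & fused, size_t n_embd) {
    qkv_views out;
    out.q = fused.data() + 0 * n_embd; // matches offset 0*sizeof(float)*n_embd
    out.k = fused.data() + 1 * n_embd; // matches offset 1*sizeof(float)*n_embd
    out.v = fused.data() + 2 * n_embd; // matches offset 2*sizeof(float)*n_embd
    out.stride = 3 * n_embd;           // the role played by cur->nb[1], in elements
    return out;
}
```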
4907
5284
 
4908
5285
  //
@@ -4913,8 +5290,8 @@ struct llm_build_context {
4913
5290
  enum llm_offload_func_e {
4914
5291
  OFFLOAD_FUNC_NOP,
4915
5292
  OFFLOAD_FUNC,
4916
- OFFLOAD_FUNC_KQ,
4917
- OFFLOAD_FUNC_V,
5293
+ OFFLOAD_FUNC_FRC, // force offload
5294
+ OFFLOAD_FUNC_KQV,
4918
5295
  OFFLOAD_FUNC_NR,
4919
5296
  OFFLOAD_FUNC_EMB,
4920
5297
  OFFLOAD_FUNC_OUT,
@@ -5000,11 +5377,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5000
5377
  //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
5001
5378
  { "pos_embd", OFFLOAD_FUNC_NR },
5002
5379
 
5003
- { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope)
5004
- { "KQ_scale", OFFLOAD_FUNC_KQ },
5005
- { "KQ_mask", OFFLOAD_FUNC_KQ },
5006
- { "K_shift", OFFLOAD_FUNC_KQ },
5007
- { "K_shifted", OFFLOAD_FUNC_KQ },
5380
+ { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
5381
+ { "KQ_scale", OFFLOAD_FUNC_FRC },
5382
+ { "KQ_mask", OFFLOAD_FUNC_FRC },
5383
+ { "K_shift", OFFLOAD_FUNC_FRC },
5384
+
5385
+ { "K_shifted", OFFLOAD_FUNC },
5008
5386
 
5009
5387
  { "inp_norm", OFFLOAD_FUNC_NR },
5010
5388
  { "inp_norm_w", OFFLOAD_FUNC_NR },
@@ -5017,38 +5395,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5017
5395
  { "attn_norm", OFFLOAD_FUNC },
5018
5396
  { "attn_norm_2", OFFLOAD_FUNC },
5019
5397
 
5020
- { "wqkv", OFFLOAD_FUNC_KQ },
5021
- { "bqkv", OFFLOAD_FUNC_KQ },
5022
- { "wqkv_clamped", OFFLOAD_FUNC_KQ },
5023
-
5024
- { "tmpk", OFFLOAD_FUNC_KQ },
5025
- { "tmpq", OFFLOAD_FUNC_KQ },
5026
- { "tmpv", OFFLOAD_FUNC_V },
5027
- { "Kcur", OFFLOAD_FUNC_KQ },
5028
- { "Qcur", OFFLOAD_FUNC_KQ },
5029
- { "Vcur", OFFLOAD_FUNC_V },
5030
-
5031
- { "krot", OFFLOAD_FUNC_KQ },
5032
- { "qrot", OFFLOAD_FUNC_KQ },
5033
- { "kpass", OFFLOAD_FUNC_KQ },
5034
- { "qpass", OFFLOAD_FUNC_KQ },
5035
- { "krotated", OFFLOAD_FUNC_KQ },
5036
- { "qrotated", OFFLOAD_FUNC_KQ },
5037
-
5038
- { "q", OFFLOAD_FUNC_KQ },
5039
- { "k", OFFLOAD_FUNC_KQ },
5040
- { "kq", OFFLOAD_FUNC_KQ },
5041
- { "kq_scaled", OFFLOAD_FUNC_KQ },
5042
- { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
5043
- { "kq_masked", OFFLOAD_FUNC_KQ },
5044
- { "kq_soft_max", OFFLOAD_FUNC_V },
5045
- { "kq_soft_max_ext", OFFLOAD_FUNC_V },
5046
- { "v", OFFLOAD_FUNC_V },
5047
- { "kqv", OFFLOAD_FUNC_V },
5048
- { "kqv_merged", OFFLOAD_FUNC_V },
5049
- { "kqv_merged_cont", OFFLOAD_FUNC_V },
5050
- { "kqv_wo", OFFLOAD_FUNC_V },
5051
- { "kqv_out", OFFLOAD_FUNC_V },
5398
+ { "wqkv", OFFLOAD_FUNC_KQV },
5399
+ { "bqkv", OFFLOAD_FUNC_KQV },
5400
+ { "wqkv_clamped", OFFLOAD_FUNC_KQV },
5401
+
5402
+ { "tmpk", OFFLOAD_FUNC_KQV },
5403
+ { "tmpq", OFFLOAD_FUNC_KQV },
5404
+ { "tmpv", OFFLOAD_FUNC_KQV },
5405
+ { "Kcur", OFFLOAD_FUNC_KQV },
5406
+ { "Qcur", OFFLOAD_FUNC_KQV },
5407
+ { "Vcur", OFFLOAD_FUNC_KQV },
5408
+
5409
+ { "krot", OFFLOAD_FUNC_KQV },
5410
+ { "qrot", OFFLOAD_FUNC_KQV },
5411
+ { "kpass", OFFLOAD_FUNC_KQV },
5412
+ { "qpass", OFFLOAD_FUNC_KQV },
5413
+ { "krotated", OFFLOAD_FUNC_KQV },
5414
+ { "qrotated", OFFLOAD_FUNC_KQV },
5415
+
5416
+ { "q", OFFLOAD_FUNC_KQV },
5417
+ { "k", OFFLOAD_FUNC_KQV },
5418
+ { "kq", OFFLOAD_FUNC_KQV },
5419
+ { "kq_scaled", OFFLOAD_FUNC_KQV },
5420
+ { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
5421
+ { "kq_masked", OFFLOAD_FUNC_KQV },
5422
+ { "kq_soft_max", OFFLOAD_FUNC_KQV },
5423
+ { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
5424
+ { "v", OFFLOAD_FUNC_KQV },
5425
+ { "kqv", OFFLOAD_FUNC_KQV },
5426
+ { "kqv_merged", OFFLOAD_FUNC_KQV },
5427
+ { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
5428
+ { "kqv_wo", OFFLOAD_FUNC_KQV },
5429
+ { "kqv_out", OFFLOAD_FUNC_KQV },
5052
5430
 
5053
5431
  { "ffn_inp", OFFLOAD_FUNC },
5054
5432
  { "ffn_norm", OFFLOAD_FUNC },
@@ -5240,15 +5618,15 @@ static struct ggml_cgraph * llama_build_graph(
5240
5618
  { OFFLOAD_FUNC_NOP, "CPU" },
5241
5619
  { OFFLOAD_FUNC_OUT, "CPU" },
5242
5620
  #ifdef GGML_USE_CUBLAS
5243
- { OFFLOAD_FUNC, "GPU (CUDA)" },
5244
- { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" },
5245
- { OFFLOAD_FUNC_V, "GPU (CUDA) V" },
5246
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
5621
+ { OFFLOAD_FUNC, "GPU (CUDA)" },
5622
+ { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
5623
+ { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
5624
+ { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
5247
5625
  { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
5248
5626
  #else
5249
5627
  { OFFLOAD_FUNC, "CPU" },
5250
- { OFFLOAD_FUNC_KQ, "CPU" },
5251
- { OFFLOAD_FUNC_V, "CPU" },
5628
+ { OFFLOAD_FUNC_FRC, "CPU" },
5629
+ { OFFLOAD_FUNC_KQV, "CPU" },
5252
5630
  { OFFLOAD_FUNC_NR, "CPU" },
5253
5631
  { OFFLOAD_FUNC_EMB, "CPU" },
5254
5632
  #endif // GGML_USE_CUBLAS
@@ -5281,18 +5659,23 @@ static struct ggml_cgraph * llama_build_graph(
5281
5659
  }
5282
5660
  }
5283
5661
  break;
5284
- case OFFLOAD_FUNC_NR:
5285
- if (n_gpu_layers <= n_layer + 0) {
5662
+ case OFFLOAD_FUNC_FRC:
5663
+ if (!lctx.cparams.offload_kqv) {
5286
5664
  func_e = OFFLOAD_FUNC_NOP;
5287
- }
5288
- break;
5289
- case OFFLOAD_FUNC_V:
5290
- if (n_gpu_layers <= n_layer + 1) {
5665
+ } break;
5666
+ case OFFLOAD_FUNC_KQV:
5667
+ if (!lctx.cparams.offload_kqv) {
5291
5668
  func_e = OFFLOAD_FUNC_NOP;
5669
+ } else {
5670
+ if (n_gpu_layers < n_layer) {
5671
+ if (il < i_gpu_start) {
5672
+ func_e = OFFLOAD_FUNC_NOP;
5673
+ }
5674
+ }
5292
5675
  }
5293
5676
  break;
5294
- case OFFLOAD_FUNC_KQ:
5295
- if (n_gpu_layers <= n_layer + 2) {
5677
+ case OFFLOAD_FUNC_NR:
5678
+ if (n_gpu_layers <= n_layer + 0) {
5296
5679
  func_e = OFFLOAD_FUNC_NOP;
5297
5680
  }
5298
5681
  break;
@@ -5317,8 +5700,8 @@ static struct ggml_cgraph * llama_build_graph(
5317
5700
  case OFFLOAD_FUNC_NOP:
5318
5701
  case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
5319
5702
  case OFFLOAD_FUNC:
5320
- case OFFLOAD_FUNC_KQ:
5321
- case OFFLOAD_FUNC_V:
5703
+ case OFFLOAD_FUNC_KQV:
5704
+ case OFFLOAD_FUNC_FRC:
5322
5705
  case OFFLOAD_FUNC_NR:
5323
5706
  case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
5324
5707
  default: GGML_ASSERT(false);
@@ -5377,6 +5760,10 @@ static struct ggml_cgraph * llama_build_graph(
5377
5760
  {
5378
5761
  result = llm.build_stablelm();
5379
5762
  } break;
5763
+ case LLM_ARCH_QWEN:
5764
+ {
5765
+ result = llm.build_qwen();
5766
+ } break;
5380
5767
  default:
5381
5768
  GGML_ASSERT(false);
5382
5769
  }
@@ -5499,8 +5886,8 @@ static int llama_decode_internal(
5499
5886
  // a heuristic, to avoid attending the full cache if it is not yet utilized
5500
5887
  // after enough generations, the benefit from this heuristic disappears
5501
5888
  // if we start defragmenting the cache, the benefit from this will be more important
5502
- //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
5503
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
5889
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
5890
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
5504
5891
 
5505
5892
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
5506
5893
 
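Editor's note: the hunk above keeps the pad-to-32 variant of the cell-count heuristic and clamps it to the context size. A minimal sketch of the arithmetic, with hypothetical values and GGML_PAD re-declared here only for illustration (equivalent to ggml's macro for the power-of-two pad used here):

    #include <algorithm>
    #include <cstdio>

    #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n)) // round up to a multiple of n

    int main() {
        const int n_ctx    = 4096; // hypothetical context size
        const int cell_max = 100;  // hypothetical highest used KV cell
        const int n = std::min(n_ctx, std::max(32, GGML_PAD(cell_max, 32)));
        std::printf("kv_self.n = %d\n", n); // -> 128, instead of attending all 4096 cells
        return 0;
    }
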
@@ -5551,7 +5938,7 @@ static int llama_decode_internal(
5551
5938
  n_threads = std::min(4, n_threads);
5552
5939
  }
5553
5940
 
5554
- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5941
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
5555
5942
  if (ggml_cpu_has_cublas() && fully_offloaded) {
5556
5943
  n_threads = 1;
5557
5944
  }
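Editor's note: with the separate per-stage thresholds for the KQ and V tensors removed, the "fully offloaded" check above drops from n_layer + 3 to n_layer + 1. A rough illustration, assuming a hypothetical 32-layer model:

    #include <cstdio>

    int main() {
        const int n_layer = 32;          // hypothetical model depth
        const int tests[] = { 33, 35 };  // requested n_gpu_layers values
        for (int n_gpu_layers : tests) {
            const bool old_rule = n_gpu_layers >= n_layer + 3; // 0.9.x
            const bool new_rule = n_gpu_layers >= n_layer + 1; // 0.10.0
            std::printf("n_gpu_layers=%d  old=%d  new=%d\n",
                        n_gpu_layers, (int) old_rule, (int) new_rule);
        }
        return 0;
    }
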
@@ -6410,14 +6797,13 @@ struct llama_grammar_candidate {
6410
6797
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
6411
6798
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
6412
6799
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6413
- const char * src,
6414
- size_t n_src,
6800
+ const std::string & src,
6415
6801
  llama_partial_utf8 partial_start) {
6416
6802
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
6417
- const char * pos = src;
6803
+ const char * pos = src.c_str();
6418
6804
  std::vector<uint32_t> code_points;
6419
6805
  // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
6420
- code_points.reserve(n_src + 1);
6806
+ code_points.reserve(src.size() + 1);
6421
6807
  uint32_t value = partial_start.value;
6422
6808
  int n_remain = partial_start.n_remain;
6423
6809
 
@@ -6468,13 +6854,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6468
6854
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
6469
6855
  }
6470
6856
 
6471
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6472
- std::string src,
6473
- llama_partial_utf8 partial_start
6474
- ) {
6475
- return decode_utf8(src.c_str(), src.size(), partial_start);
6476
- }
6477
-
6478
6857
  // returns true iff pos points to the end of one of the definitions of a rule
6479
6858
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
6480
6859
  switch (pos->type) {
@@ -7113,11 +7492,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
7113
7492
  const llama_token eos = llama_token_eos(&ctx->model);
7114
7493
 
7115
7494
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
7495
+ candidates_decoded.reserve(candidates->size);
7116
7496
  std::vector<llama_grammar_candidate> candidates_grammar;
7497
+ candidates_grammar.reserve(candidates->size);
7117
7498
 
7118
7499
  for (size_t i = 0; i < candidates->size; ++i) {
7119
7500
  const llama_token id = candidates->data[i].id;
7120
- const std::string piece = llama_token_to_piece(ctx, id);
7501
+ const std::string & piece = ctx->model.vocab.id_to_token[id].text;
7121
7502
  if (id == eos) {
7122
7503
  if (!allow_eos) {
7123
7504
  candidates->data[i].logit = -INFINITY;
@@ -7329,7 +7710,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
7329
7710
  GGML_ASSERT(false);
7330
7711
  }
7331
7712
 
7332
- const std::string piece = llama_token_to_piece(ctx, token);
7713
+ const std::string & piece = ctx->model.vocab.id_to_token[token].text;
7333
7714
 
7334
7715
  // Note terminating 0 in decoded string
7335
7716
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7650,18 +8031,21 @@ static void llama_convert_tensor_internal(
7650
8031
  return;
7651
8032
  }
7652
8033
 
7653
- auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
7654
- auto block_size_bytes = ggml_type_size(tensor->type);
8034
+ size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
8035
+ size_t block_size_bytes = ggml_type_size(tensor->type);
7655
8036
 
7656
8037
  GGML_ASSERT(nelements % block_size == 0);
7657
- auto nblocks = nelements / block_size;
7658
- auto blocks_per_thread = nblocks / nthread;
7659
- auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
8038
+ size_t nblocks = nelements / block_size;
8039
+ size_t blocks_per_thread = nblocks / nthread;
8040
+ size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
8041
+
8042
+ size_t in_buff_offs = 0;
8043
+ size_t out_buff_offs = 0;
7660
8044
 
7661
- for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
7662
- auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
7663
- auto thr_elems = thr_blocks * block_size; // number of elements for this thread
7664
- auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
8045
+ for (int tnum = 0; tnum < nthread; tnum++) {
8046
+ size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
8047
+ size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
8048
+ size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
7665
8049
 
7666
8050
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
7667
8051
  if (typ == GGML_TYPE_F16) {
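Editor's note: the loop above also switches the buffer offsets from auto (deduced as int) to size_t, presumably so byte offsets on multi-gigabyte tensors cannot wrap. A quick sanity check with hypothetical numbers:

    #include <cstdio>

    int main() {
        const size_t nblocks          = 80ull * 1024 * 1024; // hypothetical block count
        const size_t block_size_bytes = 34;                  // hypothetical bytes per block
        std::printf("input bytes: %zu (too large for a 32-bit int)\n",
                    nblocks * block_size_bytes);
        return 0;
    }
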
@@ -7831,7 +8215,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7831
8215
  constexpr bool use_mmap = false;
7832
8216
  #endif
7833
8217
 
7834
- llama_model_loader ml(fname_inp, use_mmap);
8218
+ llama_model_loader ml(fname_inp, use_mmap, NULL);
7835
8219
  if (ml.use_mmap) {
7836
8220
  ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
7837
8221
  }
@@ -8127,7 +8511,7 @@ static int llama_apply_lora_from_file_internal(
8127
8511
  std::vector<uint8_t> base_buf;
8128
8512
  if (path_base_model) {
8129
8513
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
8130
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
8514
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
8131
8515
 
8132
8516
  size_t ctx_size;
8133
8517
  size_t mmapped_size;
@@ -8355,6 +8739,7 @@ struct llama_model_params llama_model_default_params() {
8355
8739
  /*.tensor_split =*/ nullptr,
8356
8740
  /*.progress_callback =*/ nullptr,
8357
8741
  /*.progress_callback_user_data =*/ nullptr,
8742
+ /*.kv_overrides =*/ nullptr,
8358
8743
  /*.vocab_only =*/ false,
8359
8744
  /*.use_mmap =*/ true,
8360
8745
  /*.use_mlock =*/ false,
@@ -8382,10 +8767,12 @@ struct llama_context_params llama_context_default_params() {
8382
8767
  /*.yarn_beta_fast =*/ 32.0f,
8383
8768
  /*.yarn_beta_slow =*/ 1.0f,
8384
8769
  /*.yarn_orig_ctx =*/ 0,
8770
+ /*.type_k =*/ GGML_TYPE_F16,
8771
+ /*.type_v =*/ GGML_TYPE_F16,
8385
8772
  /*.mul_mat_q =*/ true,
8386
- /*.f16_kv =*/ true,
8387
8773
  /*.logits_all =*/ false,
8388
8774
  /*.embedding =*/ false,
8775
+ /*.offload_kqv =*/ true,
8389
8776
  };
8390
8777
 
8391
8778
  return result;
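Editor's note: the removed f16_kv flag is replaced by explicit K/V cache types plus the new offload_kqv switch. A minimal caller-side sketch using only the fields shown above; the model is assumed to be loaded elsewhere:

    #include "llama.h"

    // Create a context with explicit KV cache types (a sketch, not the library's own code).
    static llama_context * make_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.type_k      = GGML_TYPE_F16; // K cache type (replaces the removed f16_kv flag)
        cparams.type_v      = GGML_TYPE_F16; // V cache type
        cparams.offload_kqv = true;          // keep the KV cache / KQV ops on the GPU
        return llama_new_context_with_model(model, cparams);
    }
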
@@ -8502,6 +8889,7 @@ struct llama_context * llama_new_context_with_model(
8502
8889
  cparams.yarn_beta_fast = params.yarn_beta_fast;
8503
8890
  cparams.yarn_beta_slow = params.yarn_beta_slow;
8504
8891
  cparams.mul_mat_q = params.mul_mat_q;
8892
+ cparams.offload_kqv = params.offload_kqv;
8505
8893
 
8506
8894
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
8507
8895
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8535,19 +8923,36 @@ struct llama_context * llama_new_context_with_model(
8535
8923
  ctx->rng = std::mt19937(params.seed);
8536
8924
  ctx->logits_all = params.logits_all;
8537
8925
 
8538
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
8926
+ const ggml_type type_k = params.type_k;
8927
+ const ggml_type type_v = params.type_v;
8928
+
8929
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
8930
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
8539
8931
 
8540
8932
  // reserve memory for context buffers
8541
8933
  if (!hparams.vocab_only) {
8542
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
8934
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
8543
8935
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
8544
8936
  llama_free(ctx);
8545
8937
  return nullptr;
8546
8938
  }
8547
8939
 
8548
8940
  {
8549
- const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
8550
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
8941
+ size_t memory_size_k = 0;
8942
+ size_t memory_size_v = 0;
8943
+
8944
+ for (auto & k : ctx->kv_self.k_l) {
8945
+ memory_size_k += ggml_nbytes(k);
8946
+ }
8947
+
8948
+ for (auto & v : ctx->kv_self.v_l) {
8949
+ memory_size_v += ggml_nbytes(v);
8950
+ }
8951
+
8952
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
8953
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
8954
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
8955
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
8551
8956
  }
8552
8957
 
8553
8958
  // resized during inference
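Editor's note: the new log line reports K and V sizes separately, summed over the per-layer tensors. A back-of-the-envelope check of the number it prints, assuming a hypothetical 32-layer, 4096-dim model with no grouped-query attention, n_ctx = 4096, and F16 for both caches:

    #include <cstdio>

    int main() {
        const size_t n_layer = 32, n_ctx = 4096, n_embd = 4096; // hypothetical shapes
        const size_t bytes_per_elem = 2;                        // GGML_TYPE_F16
        const size_t memory_size_k = n_layer * n_ctx * n_embd * bytes_per_elem;
        const size_t memory_size_v = memory_size_k;             // same element count, transposed layout
        std::printf("KV self size = %7.2f MiB\n",
                    (memory_size_k + memory_size_v) / (1024.0 * 1024.0));
        // -> KV self size = 2048.00 MiB
        return 0;
    }
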
@@ -8618,8 +9023,12 @@ struct llama_context * llama_new_context_with_model(
8618
9023
  }
8619
9024
 
8620
9025
  size_t kv_vram_size = 0;
8621
- add_tensor(ctx->kv_self.k, kv_vram_size);
8622
- add_tensor(ctx->kv_self.v, kv_vram_size);
9026
+ for (auto & k : ctx->kv_self.k_l) {
9027
+ add_tensor(k, kv_vram_size);
9028
+ }
9029
+ for (auto & v : ctx->kv_self.v_l) {
9030
+ add_tensor(v, kv_vram_size);
9031
+ }
8623
9032
 
8624
9033
  size_t ctx_vram_size = alloc_size + kv_vram_size;
8625
9034
  size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9089,37 +9498,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
9089
9498
  data_ctx->write(&kv_used, sizeof(kv_used));
9090
9499
 
9091
9500
  if (kv_buf_size) {
9092
- const size_t elt_size = ggml_element_size(kv_self.k);
9501
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
9093
9502
 
9094
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9503
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9095
9504
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
9096
9505
 
9097
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
9098
- std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
9099
- kout3d->data = kout3d_data.data();
9506
+ std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
9507
+ std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
9508
+
9509
+ for (int il = 0; il < (int) n_layer; ++il) {
9510
+ ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9511
+ kout2d_data[il].resize(ggml_nbytes(kout2d));
9512
+ kout2d->data = kout2d_data[il].data();
9100
9513
 
9101
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
9102
- std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
9103
- vout3d->data = vout3d_data.data();
9514
+ ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9515
+ vout2d_data[il].resize(ggml_nbytes(vout2d));
9516
+ vout2d->data = vout2d_data[il].data();
9104
9517
 
9105
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
9106
- n_embd, kv_head, n_layer,
9107
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
9518
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9519
+ n_embd, kv_head,
9520
+ elt_size*n_embd, 0);
9108
9521
 
9109
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
9110
- kv_head, n_embd, n_layer,
9111
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
9522
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9523
+ kv_head, n_embd,
9524
+ elt_size*n_ctx, 0);
9525
+
9526
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
9527
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
9528
+ }
9112
9529
 
9113
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
9114
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
9115
9530
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
9116
9531
 
9117
9532
  ggml_free(cpy_ctx);
9118
9533
 
9119
- // our data is now in the kout3d_data and vout3d_data buffers
9534
+ // our data is now in the kout2d_data and vout2d_data buffers
9120
9535
  // write them to file
9121
- data_ctx->write(kout3d_data.data(), kout3d_data.size());
9122
- data_ctx->write(vout3d_data.data(), vout3d_data.size());
9536
+ for (uint32_t il = 0; il < n_layer; ++il) {
9537
+ data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
9538
+ data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
9539
+ }
9123
9540
  }
9124
9541
 
9125
9542
  for (uint32_t i = 0; i < kv_size; ++i) {
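Editor's note: the session-state writer above now serializes the cache layer by layer (2-D K/V blocks instead of one 3-D blob), so state blobs saved by 0.9.x presumably cannot be restored by this version. The public entry points are unchanged; a minimal round-trip sketch:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Save and later restore the full context state of an existing llama_context.
    static void roundtrip_state(llama_context * ctx) {
        std::vector<uint8_t> state(llama_get_state_size(ctx));
        llama_copy_state_data(ctx, state.data());
        // ... later, on a context created with the same model and parameters ...
        llama_set_state_data(ctx, state.data());
    }
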
@@ -9219,29 +9636,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
9219
9636
  if (kv_buf_size) {
9220
9637
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
9221
9638
 
9222
- const size_t elt_size = ggml_element_size(kv_self.k);
9639
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
9223
9640
 
9224
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9641
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9225
9642
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
9226
9643
 
9227
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
9228
- kin3d->data = (void *) inp;
9229
- inp += ggml_nbytes(kin3d);
9644
+ for (int il = 0; il < n_layer; ++il) {
9645
+ ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9646
+ kin2d->data = (void *) inp;
9647
+ inp += ggml_nbytes(kin2d);
9230
9648
 
9231
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
9232
- vin3d->data = (void *) inp;
9233
- inp += ggml_nbytes(vin3d);
9649
+ ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9650
+ vin2d->data = (void *) inp;
9651
+ inp += ggml_nbytes(vin2d);
9234
9652
 
9235
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
9236
- n_embd, kv_head, n_layer,
9237
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
9653
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9654
+ n_embd, kv_head,
9655
+ elt_size*n_embd, 0);
9238
9656
 
9239
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
9240
- kv_head, n_embd, n_layer,
9241
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
9657
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9658
+ kv_head, n_embd,
9659
+ elt_size*n_ctx, 0);
9660
+
9661
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
9662
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
9663
+ }
9242
9664
 
9243
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
9244
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
9245
9665
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
9246
9666
 
9247
9667
  ggml_free(cpy_ctx);