llama_cpp 0.9.5 → 0.10.0

This diff shows the changes between the publicly released contents of these two package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -74,6 +74,7 @@
74
74
  #include <set>
75
75
  #include <sstream>
76
76
  #include <thread>
77
+ #include <type_traits>
77
78
  #include <unordered_map>
78
79
 
79
80
  #if defined(_MSC_VER)
@@ -192,6 +193,7 @@ enum llm_arch {
192
193
  LLM_ARCH_REFACT,
193
194
  LLM_ARCH_BLOOM,
194
195
  LLM_ARCH_STABLELM,
196
+ LLM_ARCH_QWEN,
195
197
  LLM_ARCH_UNKNOWN,
196
198
  };
197
199
 
@@ -208,6 +210,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
208
210
  { LLM_ARCH_REFACT, "refact" },
209
211
  { LLM_ARCH_BLOOM, "bloom" },
210
212
  { LLM_ARCH_STABLELM, "stablelm" },
213
+ { LLM_ARCH_QWEN, "qwen" },
211
214
  };
212
215
 
213
216
  enum llm_kv {
@@ -518,6 +521,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
518
521
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
519
522
  },
520
523
  },
524
+ {
525
+ LLM_ARCH_QWEN,
526
+ {
527
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
528
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
529
+ { LLM_TENSOR_OUTPUT, "output" },
530
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
531
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
532
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
533
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
534
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
535
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
536
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
537
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
538
+ },
539
+ },
521
540
 
522
541
  {
523
542
  LLM_ARCH_UNKNOWN,
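
The new LLM_ARCH_QWEN block above maps the generic tensor enums to GGUF name templates. Combined with the ".weight"/".bias" suffixes that the loader appends (via the tn(...) calls visible further down in this diff), these expand to concrete per-layer tensor names. A minimal, self-contained sketch of that expansion, using only the printf-style templates shown above; the helper name here is hypothetical, the real code goes through LLM_TN and llama_model_loader::create_tensor:

    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for the LLM_TN name resolution; the templates are the
    // ones registered for LLM_ARCH_QWEN above.
    static std::string qwen_tensor_name(const char * templ, int layer, const char * suffix) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), templ, layer);
        return std::string(buf) + "." + suffix;
    }

    int main() {
        // prints: blk.0.attn_qkv.weight, blk.0.attn_qkv.bias, blk.0.ffn_gate.weight
        std::printf("%s\n", qwen_tensor_name("blk.%d.attn_qkv", 0, "weight").c_str());
        std::printf("%s\n", qwen_tensor_name("blk.%d.attn_qkv", 0, "bias").c_str());
        std::printf("%s\n", qwen_tensor_name("blk.%d.ffn_gate", 0, "weight").c_str());
        return 0;
    }
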
@@ -572,21 +591,6 @@ struct LLM_TN {
572
591
  // gguf helpers
573
592
  //
574
593
 
575
- #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
576
- do { \
577
- const std::string skey(key); \
578
- const int kid = gguf_find_key(ctx, skey.c_str()); \
579
- if (kid >= 0) { \
580
- enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
581
- if (ktype != (type)) { \
582
- throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
583
- } \
584
- (dst) = func(ctx, kid); \
585
- } else if (req) { \
586
- throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
587
- } \
588
- } while (0)
589
-
590
594
  static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
591
595
  { LLAMA_ROPE_SCALING_NONE, "none" },
592
596
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
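
The GGUF_GET_KEY macro removed above is superseded by the typed helpers introduced later in this diff (the GGUFMeta namespace and llama_model_loader::get_key/get_arr_n). For orientation, a before/after sketch of one call site taken from the hparams hunk further down; the surrounding ctx, ml, hparams and kv objects are assumed from that context:

    // old: the caller names both the getter and the GGUF type, mismatch checked at runtime
    GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32,
                 true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));

    // new: the value type is deduced from the destination, per-key overrides are honored,
    // and a missing required key still throws from inside get_key()
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
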
@@ -620,7 +624,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
620
624
  }
621
625
  }
622
626
 
623
- static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
627
+ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
624
628
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
625
629
 
626
630
  switch (type) {
@@ -1222,6 +1226,7 @@ struct llama_cparams {
1222
1226
  float yarn_beta_slow;
1223
1227
 
1224
1228
  bool mul_mat_q;
1229
+ bool offload_kqv;
1225
1230
  };
1226
1231
 
1227
1232
  struct llama_layer {
@@ -1243,6 +1248,9 @@ struct llama_layer {
1243
1248
  struct ggml_tensor * wqkv;
1244
1249
 
1245
1250
  // attention bias
1251
+ struct ggml_tensor * bq;
1252
+ struct ggml_tensor * bk;
1253
+ struct ggml_tensor * bv;
1246
1254
  struct ggml_tensor * bo;
1247
1255
  struct ggml_tensor * bqkv;
1248
1256
 
@@ -1287,8 +1295,8 @@ struct llama_kv_cache {
1287
1295
 
1288
1296
  std::vector<llama_kv_cell> cells;
1289
1297
 
1290
- struct ggml_tensor * k = NULL;
1291
- struct ggml_tensor * v = NULL;
1298
+ std::vector<struct ggml_tensor *> k_l; // per layer
1299
+ std::vector<struct ggml_tensor *> v_l;
1292
1300
 
1293
1301
  struct ggml_context * ctx = NULL;
1294
1302
 
@@ -1301,8 +1309,10 @@ struct llama_kv_cache {
1301
1309
 
1302
1310
  #ifdef GGML_USE_CUBLAS
1303
1311
  if (ggml_cublas_loaded()) {
1304
- ggml_cuda_free_data(k);
1305
- ggml_cuda_free_data(v);
1312
+ for (size_t i = 0; i < k_l.size(); ++i) {
1313
+ ggml_cuda_free_data(k_l[i]);
1314
+ ggml_cuda_free_data(v_l[i]);
1315
+ }
1306
1316
  }
1307
1317
  #endif
1308
1318
  }
@@ -1492,9 +1502,11 @@ struct llama_context {
1492
1502
  static bool llama_kv_cache_init(
1493
1503
  const struct llama_hparams & hparams,
1494
1504
  struct llama_kv_cache & cache,
1495
- ggml_type wtype,
1505
+ ggml_type ktype,
1506
+ ggml_type vtype,
1496
1507
  uint32_t n_ctx,
1497
- int n_gpu_layers) {
1508
+ int n_gpu_layers,
1509
+ bool offload) {
1498
1510
  const uint32_t n_embd = hparams.n_embd_gqa();
1499
1511
  const uint32_t n_layer = hparams.n_layer;
1500
1512
 
@@ -1510,7 +1522,7 @@ static bool llama_kv_cache_init(
1510
1522
  cache.cells.clear();
1511
1523
  cache.cells.resize(n_ctx);
1512
1524
 
1513
- cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1525
+ cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
1514
1526
  memset(cache.buf.data, 0, cache.buf.size);
1515
1527
 
1516
1528
  struct ggml_init_params params;
@@ -1520,37 +1532,44 @@ static bool llama_kv_cache_init(
1520
1532
 
1521
1533
  cache.ctx = ggml_init(params);
1522
1534
 
1535
+ size_t vram_kv_cache = 0;
1536
+
1523
1537
  if (!cache.ctx) {
1524
1538
  LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
1525
1539
  return false;
1526
1540
  }
1527
1541
 
1528
- cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
1529
- cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
1530
- ggml_set_name(cache.k, "cache_k");
1531
- ggml_set_name(cache.v, "cache_v");
1542
+ cache.k_l.reserve(n_layer);
1543
+ cache.v_l.reserve(n_layer);
1532
1544
 
1533
- (void) n_gpu_layers;
1545
+ const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
1534
1546
 
1535
- #ifdef GGML_USE_CUBLAS
1536
- if (ggml_cublas_loaded()) {
1537
- size_t vram_kv_cache = 0;
1547
+ GGML_UNUSED(offload);
1538
1548
 
1539
- if (n_gpu_layers > (int)n_layer + 1) {
1540
- ggml_cuda_assign_buffers_no_scratch(cache.v);
1541
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1542
- vram_kv_cache += ggml_nbytes(cache.v);
1543
- }
1544
- if (n_gpu_layers > (int)n_layer + 2) {
1545
- ggml_cuda_assign_buffers_no_scratch(cache.k);
1546
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1547
- vram_kv_cache += ggml_nbytes(cache.k);
1548
- }
1549
- if (vram_kv_cache > 0) {
1550
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1549
+ for (int i = 0; i < (int) n_layer; i++) {
1550
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
1551
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
1552
+ ggml_format_name(k, "cache_k_l%d", i);
1553
+ ggml_format_name(v, "cache_v_l%d", i);
1554
+ cache.k_l.push_back(k);
1555
+ cache.v_l.push_back(v);
1556
+ #ifdef GGML_USE_CUBLAS
1557
+ if (i >= i_gpu_start) {
1558
+ if (offload) {
1559
+ ggml_cuda_assign_buffers_no_scratch(k);
1560
+ vram_kv_cache += ggml_nbytes(k);
1561
+ ggml_cuda_assign_buffers_no_scratch(v);
1562
+ vram_kv_cache += ggml_nbytes(v);
1563
+ }
1551
1564
  }
1565
+ #endif // GGML_USE_CUBLAS
1552
1566
  }
1553
- #endif
1567
+
1568
+ if (vram_kv_cache > 0) {
1569
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1570
+ }
1571
+
1572
+ GGML_UNUSED(n_gpu_layers);
1554
1573
 
1555
1574
  return true;
1556
1575
  }
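
With this hunk llama_kv_cache_init allocates one K tensor and one V tensor per layer (k_l/v_l), accepts separate ktype/vtype, and, when offload is set, assigns the per-layer tensors of GPU layers to VRAM. The sizing on the cache.buf.resize line assumes n_elements = n_embd_gqa * n_ctx * n_layer, which is computed just outside this hunk. A small standalone sketch of that arithmetic with hypothetical numbers:

    #include <cstddef>
    #include <cstdio>

    // Hedged restatement of the sizing in llama_kv_cache_init(); type sizes are passed
    // in directly instead of calling ggml_type_sizef(), and the per-tensor overhead is a guess.
    static size_t kv_cache_bytes(size_t n_embd_gqa, size_t n_ctx, size_t n_layer,
                                 double k_size, double v_size, size_t tensor_overhead) {
        const size_t n_elements = n_embd_gqa * n_ctx * n_layer;
        return (size_t)(n_elements * (k_size + v_size)) + 2 * n_layer * tensor_overhead;
    }

    int main() {
        // hypothetical 7B-like shape, f16 K and V (2 bytes each): roughly 2 GiB for the whole cache
        std::printf("%zu bytes\n", kv_cache_bytes(4096, 4096, 32, 2.0, 2.0, 256));
        return 0;
    }
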
@@ -1771,6 +1790,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
1771
1790
  return buf;
1772
1791
  }
1773
1792
 
1793
+ namespace GGUFMeta {
1794
+ template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
1795
+ struct GKV_Base_Type {
1796
+ static constexpr gguf_type gt = gt_;
1797
+
1798
+ static T getter(const gguf_context * ctx, const int kid) {
1799
+ return gfun(ctx, kid);
1800
+ }
1801
+ };
1802
+
1803
+ template<typename T> struct GKV_Base;
1804
+
1805
+ template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
1806
+ template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
1807
+ template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
1808
+ template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
1809
+ template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
1810
+ template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
1811
+ template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
1812
+ template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
1813
+ template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
1814
+ template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
1815
+ template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
1816
+ template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
1817
+
1818
+ template<> struct GKV_Base<std::string> {
1819
+ static constexpr gguf_type gt = GGUF_TYPE_STRING;
1820
+
1821
+ static std::string getter(const gguf_context * ctx, const int kid) {
1822
+ return gguf_get_val_str(ctx, kid);
1823
+ }
1824
+ };
1825
+
1826
+ struct ArrayInfo{
1827
+ const gguf_type gt;
1828
+ const size_t length;
1829
+ const void * data;
1830
+ };
1831
+
1832
+ template<> struct GKV_Base<ArrayInfo> {
1833
+ public:
1834
+ static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
1835
+ static ArrayInfo getter(const gguf_context *ctx, const int k) {
1836
+ return ArrayInfo {
1837
+ gguf_get_arr_type(ctx, k),
1838
+ size_t(gguf_get_arr_n(ctx, k)),
1839
+ gguf_get_arr_data(ctx, k),
1840
+ };
1841
+ }
1842
+ };
1843
+
1844
+ template<typename T>
1845
+ class GKV: public GKV_Base<T> {
1846
+ GKV() = delete;
1847
+
1848
+ public:
1849
+ static T get_kv(const gguf_context * ctx, const int k) {
1850
+ const enum gguf_type kt = gguf_get_kv_type(ctx, k);
1851
+
1852
+ if (kt != GKV::gt) {
1853
+ throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
1854
+ gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
1855
+ }
1856
+ return GKV::getter(ctx, k);
1857
+ }
1858
+
1859
+ static const char * override_type_to_str(const llama_model_kv_override_type ty) {
1860
+ switch (ty) {
1861
+ case LLAMA_KV_OVERRIDE_BOOL: return "bool";
1862
+ case LLAMA_KV_OVERRIDE_INT: return "int";
1863
+ case LLAMA_KV_OVERRIDE_FLOAT: return "float";
1864
+ }
1865
+ return "unknown";
1866
+ }
1867
+
1868
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
1869
+ if (!override) { return false; }
1870
+ if (override->tag == expected_type) {
1871
+ LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
1872
+ __func__, override_type_to_str(override->tag), override->key);
1873
+ switch (override->tag) {
1874
+ case LLAMA_KV_OVERRIDE_BOOL: {
1875
+ printf("%s\n", override->bool_value ? "true" : "false");
1876
+ } break;
1877
+ case LLAMA_KV_OVERRIDE_INT: {
1878
+ printf("%" PRId64 "\n", override->int_value);
1879
+ } break;
1880
+ case LLAMA_KV_OVERRIDE_FLOAT: {
1881
+ printf("%.6f\n", override->float_value);
1882
+ } break;
1883
+ default:
1884
+ // Shouldn't be possible to end up here, but just in case...
1885
+ throw std::runtime_error(
1886
+ format("Unsupported attempt to override %s type for metadata key %s\n",
1887
+ override_type_to_str(override->tag), override->key));
1888
+ }
1889
+ return true;
1890
+ }
1891
+ LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
1892
+ __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
1893
+ return false;
1894
+ }
1895
+
1896
+ template<typename OT>
1897
+ static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
1898
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1899
+ if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
1900
+ target = override->bool_value;
1901
+ return true;
1902
+ }
1903
+ return true;
1904
+ }
1905
+
1906
+ template<typename OT>
1907
+ static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
1908
+ try_override(OT & target, const struct llama_model_kv_override *override) {
1909
+ if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
1910
+ target = override->int_value;
1911
+ return true;
1912
+ }
1913
+ return false;
1914
+ }
1915
+
1916
+ template<typename OT>
1917
+ static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
1918
+ try_override(T & target, const struct llama_model_kv_override *override) {
1919
+ if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
1920
+ target = override->float_value;
1921
+ return true;
1922
+ }
1923
+ return false;
1924
+ }
1925
+
1926
+ template<typename OT>
1927
+ static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
1928
+ try_override(T & target, const struct llama_model_kv_override *override) {
1929
+ (void)target;
1930
+ (void)override;
1931
+ if (!override) { return false; }
1932
+ // Currently, we should never end up here so it would be a bug if we do.
1933
+ throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
1934
+ override ? override->key : "NULL"));
1935
+ }
1936
+
1937
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
1938
+ if (try_override<T>(target, override)) {
1939
+ return true;
1940
+ }
1941
+ if (k < 0) { return false; }
1942
+ target = get_kv(ctx, k);
1943
+ return true;
1944
+ }
1945
+
1946
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
1947
+ return set(ctx, gguf_find_key(ctx, key), target, override);
1948
+ }
1949
+
1950
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
1951
+ return set(ctx, key.c_str(), target, override);
1952
+ }
1953
+ };
1954
+ }
1955
+
1774
1956
  struct llama_model_loader {
1775
1957
  int n_kv = 0;
1776
1958
  int n_tensors = 0;
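
The GGUFMeta namespace added above binds each C++ destination type to a GGUF value type at compile time (the GKV_Base specializations) and routes reads through GKV<T>::set, which applies a matching llama_model_kv_override first and otherwise calls the typed getter, throwing on a type mismatch. A hedged fragment showing direct use (the loader normally goes through get_key, defined later in this diff); the key strings are the conventional GGUF ones and should be treated as illustrative:

    // Assumes an already-initialized gguf_context * ctx_gguf from this translation unit.
    uint32_t    n_ctx_train = 0;
    std::string general_name;

    // A wrong value type throws std::runtime_error inside GKV<T>::get_kv();
    // a missing key leaves the target untouched and makes set() return false.
    const bool has_ctx  = GGUFMeta::GKV<uint32_t>::set(ctx_gguf, "llama.context_length", n_ctx_train);
    const bool has_name = GGUFMeta::GKV<std::string>::set(ctx_gguf, "general.name", general_name);

One wrinkle worth noting: the bool specialization of try_override above falls through to "return true" even when no override is supplied, so a boolean set() returns before the file is consulted, which looks unintentional next to the other specializations.
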
@@ -1786,21 +1968,34 @@ struct llama_model_loader {
1786
1968
  llama_fver fver;
1787
1969
 
1788
1970
  std::unique_ptr<llama_mmap> mapping;
1971
+ std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
1789
1972
 
1790
1973
  struct gguf_context * ctx_gguf = NULL;
1791
1974
  struct ggml_context * ctx_meta = NULL;
1792
1975
 
1793
- llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
1976
+ std::string arch_name;
1977
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
1978
+
1979
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
1794
1980
  struct gguf_init_params params = {
1795
1981
  /*.no_alloc = */ true,
1796
1982
  /*.ctx = */ &ctx_meta,
1797
1983
  };
1798
1984
 
1985
+ if (param_overrides_p != nullptr) {
1986
+ for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
1987
+ kv_overrides.insert({std::string(p->key), *p});
1988
+ }
1989
+ }
1990
+
1799
1991
  ctx_gguf = gguf_init_from_file(fname.c_str(), params);
1800
1992
  if (!ctx_gguf) {
1801
1993
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
1802
1994
  }
1803
1995
 
1996
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
1997
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
1998
+
1804
1999
  n_kv = gguf_get_n_kv(ctx_gguf);
1805
2000
  n_tensors = gguf_get_n_tensors(ctx_gguf);
1806
2001
 
@@ -1868,6 +2063,7 @@ struct llama_model_loader {
1868
2063
  }
1869
2064
  }
1870
2065
 
2066
+ LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
1871
2067
  for (int i = 0; i < n_kv; i++) {
1872
2068
  const char * name = gguf_get_key(ctx_gguf, i);
1873
2069
  const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
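
llama_model_loader now takes an optional, sentinel-terminated array of llama_model_kv_override entries and indexes them by key; get_key (added below) consults the override before touching the file. A hedged fragment of how a caller could build that array: the field names follow the usages visible in this diff, the key string is purely hypothetical, and key is assumed to be a fixed-size char buffer, as the p->key[0] sentinel check suggests:

    #include <cstdio>
    #include <vector>

    std::vector<llama_model_kv_override> kv_overrides;

    llama_model_kv_override o{};
    std::snprintf(o.key, sizeof(o.key), "%s", "some.integer.key"); // hypothetical key
    o.tag       = LLAMA_KV_OVERRIDE_INT;
    o.int_value = 42;
    kv_overrides.push_back(o);

    kv_overrides.push_back(llama_model_kv_override{}); // zeroed entry: key[0] == 0 ends the list

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = kv_overrides.data(); // forwarded to the loader by llama_model_load below
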
@@ -1913,19 +2109,59 @@ struct llama_model_loader {
1913
2109
  }
1914
2110
  }
1915
2111
 
1916
- std::string get_arch_name() const {
1917
- const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
2112
+ template<typename T>
2113
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2114
+ get_arr_n(const std::string & key, T & result, const bool required = true) {
2115
+ const int kid = gguf_find_key(ctx_gguf, key.c_str());
2116
+
2117
+ if (kid < 0) {
2118
+ if (required) {
2119
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2120
+ }
2121
+ return false;
2122
+ }
2123
+
2124
+ struct GGUFMeta::ArrayInfo arr_info =
2125
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
1918
2126
 
1919
- std::string arch_name;
1920
- GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
1921
2127
 
2128
+ result = arr_info.length;
2129
+ return true;
2130
+ }
2131
+
2132
+ template<typename T>
2133
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
2134
+ get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
2135
+ return get_arr_n(llm_kv(kid), result, required);
2136
+ }
2137
+
2138
+ template<typename T>
2139
+ bool get_key(const std::string & key, T & result, const bool required = true) {
2140
+ auto it = kv_overrides.find(key);
2141
+
2142
+ const struct llama_model_kv_override * override =
2143
+ it != kv_overrides.end() ? &it->second : nullptr;
2144
+
2145
+ const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
2146
+
2147
+ if (required && !found) {
2148
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
2149
+ }
2150
+
2151
+ return found;
2152
+ }
2153
+
2154
+ template<typename T>
2155
+ bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
2156
+ return get_key(llm_kv(kid), result, required);
2157
+ }
2158
+
2159
+ std::string get_arch_name() const {
1922
2160
  return arch_name;
1923
2161
  }
1924
2162
 
1925
2163
  enum llm_arch get_arch() const {
1926
- const std::string arch_name = get_arch_name();
1927
-
1928
- return llm_arch_from_string(arch_name);
2164
+ return llm_kv.arch;
1929
2165
  }
1930
2166
 
1931
2167
  const char * get_tensor_name(int i) const {
@@ -1965,10 +2201,13 @@ struct llama_model_loader {
1965
2201
  return tensor;
1966
2202
  }
1967
2203
 
1968
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
2204
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
1969
2205
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1970
2206
 
1971
2207
  if (cur == NULL) {
2208
+ if (!required) {
2209
+ return NULL;
2210
+ }
1972
2211
  throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
1973
2212
  }
1974
2213
 
@@ -2172,11 +2411,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2172
2411
  static void llm_load_hparams(
2173
2412
  llama_model_loader & ml,
2174
2413
  llama_model & model) {
2175
- struct gguf_context * ctx = ml.ctx_gguf;
2176
-
2177
- const auto kv = LLM_KV(model.arch);
2178
-
2179
2414
  auto & hparams = model.hparams;
2415
+ const gguf_context * ctx = ml.ctx_gguf;
2180
2416
 
2181
2417
  // get metadata as string
2182
2418
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2190,42 +2426,41 @@ static void llm_load_hparams(
2190
2426
  }
2191
2427
 
2192
2428
  // get general kv
2193
- GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
2429
+ ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
2194
2430
 
2195
2431
  // get hparams kv
2196
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
2197
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
2198
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
2199
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
2200
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
2201
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
2432
+ ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
2433
+ ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
2434
+ ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
2435
+ ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
2436
+ ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
2437
+ ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
2202
2438
 
2203
2439
  // n_head_kv is optional, default to n_head
2204
2440
  hparams.n_head_kv = hparams.n_head;
2205
- GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
2441
+ ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
2206
2442
 
2207
- hparams.rope_finetuned = false;
2208
- GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
2209
- kv(LLM_KV_ROPE_SCALING_FINETUNED));
2443
+ bool rope_finetuned = false;
2444
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
2445
+ hparams.rope_finetuned = rope_finetuned;
2210
2446
 
2211
2447
  hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
2212
- GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
2213
- kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
2448
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
2214
2449
 
2215
2450
  // rope_freq_base (optional)
2216
2451
  hparams.rope_freq_base_train = 10000.0f;
2217
- GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
2452
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
2218
2453
 
2219
2454
  std::string rope_scaling("linear");
2220
- GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
2455
+ ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2221
2456
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2222
2457
  GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
2223
2458
 
2224
2459
  // rope_freq_scale (inverse of the kv) is optional
2225
2460
  float ropescale = 0.0f;
2226
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
2227
- if (ropescale == 0.0f) { // try the old key name
2228
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
2461
+ if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
2462
+ // try the old key name
2463
+ ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
2229
2464
  }
2230
2465
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
2231
2466
 
@@ -2233,7 +2468,7 @@ static void llm_load_hparams(
2233
2468
  {
2234
2469
  hparams.n_rot = hparams.n_embd / hparams.n_head;
2235
2470
 
2236
- GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
2471
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
2237
2472
 
2238
2473
  if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
2239
2474
  if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2248,7 +2483,7 @@ static void llm_load_hparams(
2248
2483
  switch (model.arch) {
2249
2484
  case LLM_ARCH_LLAMA:
2250
2485
  {
2251
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2486
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2252
2487
 
2253
2488
  switch (hparams.n_layer) {
2254
2489
  case 26: model.type = e_model::MODEL_3B; break;
@@ -2262,7 +2497,7 @@ static void llm_load_hparams(
2262
2497
  } break;
2263
2498
  case LLM_ARCH_FALCON:
2264
2499
  {
2265
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2500
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2266
2501
 
2267
2502
  switch (hparams.n_layer) {
2268
2503
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2272,7 +2507,7 @@ static void llm_load_hparams(
2272
2507
  } break;
2273
2508
  case LLM_ARCH_BAICHUAN:
2274
2509
  {
2275
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2510
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2276
2511
  switch (hparams.n_layer) {
2277
2512
  case 32: model.type = e_model::MODEL_7B; break;
2278
2513
  case 40: model.type = e_model::MODEL_13B; break;
@@ -2281,7 +2516,7 @@ static void llm_load_hparams(
2281
2516
  } break;
2282
2517
  case LLM_ARCH_STARCODER:
2283
2518
  {
2284
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2519
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2285
2520
  switch (hparams.n_layer) {
2286
2521
  case 24: model.type = e_model::MODEL_1B; break;
2287
2522
  case 36: model.type = e_model::MODEL_3B; break;
@@ -2292,7 +2527,7 @@ static void llm_load_hparams(
2292
2527
  } break;
2293
2528
  case LLM_ARCH_PERSIMMON:
2294
2529
  {
2295
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2530
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2296
2531
  switch (hparams.n_layer) {
2297
2532
  case 36: model.type = e_model::MODEL_8B; break;
2298
2533
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2300,7 +2535,7 @@ static void llm_load_hparams(
2300
2535
  } break;
2301
2536
  case LLM_ARCH_REFACT:
2302
2537
  {
2303
- GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2538
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2304
2539
  switch (hparams.n_layer) {
2305
2540
  case 32: model.type = e_model::MODEL_1B; break;
2306
2541
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -2308,7 +2543,7 @@ static void llm_load_hparams(
2308
2543
  } break;
2309
2544
  case LLM_ARCH_BLOOM:
2310
2545
  {
2311
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2546
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2312
2547
 
2313
2548
  switch (hparams.n_layer) {
2314
2549
  case 24: model.type = e_model::MODEL_1B; break;
@@ -2323,9 +2558,9 @@ static void llm_load_hparams(
2323
2558
  {
2324
2559
  hparams.f_clamp_kqv = 0.0f;
2325
2560
 
2326
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2327
- GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2328
- GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2561
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2562
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
2563
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
2329
2564
 
2330
2565
  switch (hparams.n_layer) {
2331
2566
  case 32: model.type = e_model::MODEL_7B; break;
@@ -2335,13 +2570,23 @@ static void llm_load_hparams(
2335
2570
  } break;
2336
2571
  case LLM_ARCH_STABLELM:
2337
2572
  {
2338
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2573
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2339
2574
 
2340
2575
  switch (hparams.n_layer) {
2341
2576
  case 32: model.type = e_model::MODEL_3B; break;
2342
2577
  default: model.type = e_model::MODEL_UNKNOWN;
2343
2578
  }
2344
2579
  } break;
2580
+ case LLM_ARCH_QWEN:
2581
+ {
2582
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2583
+
2584
+ switch (hparams.n_layer) {
2585
+ case 32: model.type = e_model::MODEL_7B; break;
2586
+ case 40: model.type = e_model::MODEL_13B; break;
2587
+ default: model.type = e_model::MODEL_UNKNOWN;
2588
+ }
2589
+ } break;
2345
2590
 
2346
2591
  default: (void)0;
2347
2592
  }
@@ -2383,7 +2628,7 @@ static void llm_load_vocab(
2383
2628
  {
2384
2629
  std::string tokenizer_name;
2385
2630
 
2386
- GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
2631
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
2387
2632
 
2388
2633
  if (tokenizer_name == "llama") {
2389
2634
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2473,34 +2718,31 @@ static void llm_load_vocab(
2473
2718
  };
2474
2719
  for (const auto & it : special_token_types) {
2475
2720
  const std::string & key = kv(std::get<0>(it));
2476
- int32_t & id = std::get<1>(it), old_id = id;
2721
+ int32_t & id = std::get<1>(it);
2477
2722
 
2478
- GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
2479
- // Must be >= -1 and < vocab size. Since the key is unsigned, -1
2480
- // can only come from the default value, so there's no point in
2481
- // validating that.
2482
- if (size_t(id + 1) > vocab.id_to_token.size()) {
2483
- LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
2484
- __func__, key.c_str(), id, old_id);
2485
- id = old_id;
2723
+ uint32_t new_id;
2724
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
2725
+ continue;
2726
+ }
2727
+ if (new_id >= vocab.id_to_token.size()) {
2728
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
2729
+ __func__, key.c_str(), new_id, id);
2730
+ } else {
2731
+ id = new_id;
2486
2732
  }
2487
2733
 
2488
2734
  }
2489
2735
 
2490
2736
  // Handle add_bos_token and add_eos_token
2491
- std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
2492
- int kid = gguf_find_key(ctx, key.c_str());
2493
- enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2494
- vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2495
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2496
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2497
- }
2498
- key = kv(LLM_KV_TOKENIZER_ADD_EOS);
2499
- kid = gguf_find_key(ctx, key.c_str());
2500
- ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2501
- vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2502
- if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2503
- LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2737
+ {
2738
+ bool temp = true;
2739
+
2740
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2741
+ vocab.special_add_bos = int(temp);
2742
+ }
2743
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2744
+ vocab.special_add_eos = int(temp);
2745
+ }
2504
2746
  }
2505
2747
  }
2506
2748
 
@@ -2733,14 +2975,7 @@ static void llm_load_tensors(
2733
2975
  ggml_backend_type backend_output;
2734
2976
 
2735
2977
  if (n_gpu_layers > int(n_layer)) {
2736
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2737
- // on Windows however this is detrimental unless everything is on the GPU
2738
- #ifndef _WIN32
2739
- backend_norm = llama_backend_offload;
2740
- #else
2741
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2742
- #endif // _WIN32
2743
-
2978
+ backend_norm = llama_backend_offload;
2744
2979
  backend_output = llama_backend_offload_split;
2745
2980
  } else {
2746
2981
  backend_norm = GGML_BACKEND_CPU;
@@ -2777,6 +3012,12 @@ static void llm_load_tensors(
2777
3012
  layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2778
3013
  layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2779
3014
 
3015
+ // optional bias tensors
3016
+ layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
3017
+ layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
3018
+ layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
3019
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
3020
+
2780
3021
  layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2781
3022
 
2782
3023
  layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
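
create_tensor now takes a required flag (see the loader hunk above), so the four attention biases are loaded only when the GGUF file actually contains them and stay NULL otherwise; the graph-building hunk near the end of this diff guards on those pointers. The pattern, restated as a short sketch with the locals of this loop assumed:

    // load time: optional tensor, NULL when absent from the file
    layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, /*required =*/ false);

    // graph time (build_llama further down): apply the bias only if it was loaded
    if (model.layers[il].bq) {
        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
    }
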
@@ -2785,9 +3026,14 @@ static void llm_load_tensors(
2785
3026
 
2786
3027
  if (backend == GGML_BACKEND_GPU) {
2787
3028
  vram_weights +=
2788
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2789
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2790
- ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3029
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3030
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
3031
+ (layer.bq ? ggml_nbytes(layer.bq) : 0) +
3032
+ (layer.bk ? ggml_nbytes(layer.bk) : 0) +
3033
+ (layer.bv ? ggml_nbytes(layer.bv) : 0) +
3034
+ (layer.bo ? ggml_nbytes(layer.bo) : 0) +
3035
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3036
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
2791
3037
  }
2792
3038
  }
2793
3039
  } break;
@@ -2799,14 +3045,7 @@ static void llm_load_tensors(
2799
3045
  ggml_backend_type backend_output;
2800
3046
 
2801
3047
  if (n_gpu_layers > int(n_layer)) {
2802
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2803
- // on Windows however this is detrimental unless everything is on the GPU
2804
- #ifndef _WIN32
2805
- backend_norm = llama_backend_offload;
2806
- #else
2807
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2808
- #endif // _WIN32
2809
-
3048
+ backend_norm = llama_backend_offload;
2810
3049
  backend_output = llama_backend_offload_split;
2811
3050
  } else {
2812
3051
  backend_norm = GGML_BACKEND_CPU;
@@ -2869,14 +3108,7 @@ static void llm_load_tensors(
2869
3108
  ggml_backend_type backend_output;
2870
3109
 
2871
3110
  if (n_gpu_layers > int(n_layer)) {
2872
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2873
- // on Windows however this is detrimental unless everything is on the GPU
2874
- #ifndef _WIN32
2875
- backend_norm = llama_backend_offload;
2876
- #else
2877
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2878
- #endif // _WIN32
2879
-
3111
+ backend_norm = llama_backend_offload;
2880
3112
  backend_output = llama_backend_offload_split;
2881
3113
  } else {
2882
3114
  backend_norm = GGML_BACKEND_CPU;
@@ -2946,14 +3178,7 @@ static void llm_load_tensors(
2946
3178
  ggml_backend_type backend_output;
2947
3179
 
2948
3180
  if (n_gpu_layers > int(n_layer)) {
2949
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2950
- // on Windows however this is detrimental unless everything is on the GPU
2951
- #ifndef _WIN32
2952
- backend_norm = llama_backend_offload;
2953
- #else
2954
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2955
- #endif // _WIN32
2956
-
3181
+ backend_norm = llama_backend_offload;
2957
3182
  backend_output = llama_backend_offload_split;
2958
3183
  } else {
2959
3184
  backend_norm = GGML_BACKEND_CPU;
@@ -3023,21 +3248,7 @@ static void llm_load_tensors(
3023
3248
  ggml_backend_type backend_output;
3024
3249
 
3025
3250
  if (n_gpu_layers > int(n_layer)) {
3026
- #ifdef GGML_USE_CUBLAS
3027
- if (n_gpu_layers > int(n_layer + 1)) {
3028
- LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
3029
- __func__, n_layer + 1);
3030
- throw std::runtime_error("Persimmon CUDA offload failed");
3031
- }
3032
- #endif
3033
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3034
- // on Windows however this is detrimental unless everything is on the GPU
3035
- #ifndef _WIN32
3036
- backend_norm = llama_backend_offload;
3037
- #else
3038
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3039
- #endif // _WIN32
3040
-
3251
+ backend_norm = llama_backend_offload;
3041
3252
  backend_output = llama_backend_offload_split;
3042
3253
  } else {
3043
3254
  backend_norm = GGML_BACKEND_CPU;
@@ -3096,14 +3307,7 @@ static void llm_load_tensors(
3096
3307
  ggml_backend_type backend_output;
3097
3308
 
3098
3309
  if (n_gpu_layers > int(n_layer)) {
3099
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3100
- // on Windows however this is detrimental unless everything is on the GPU
3101
- #ifndef _WIN32
3102
- backend_norm = llama_backend_offload;
3103
- #else
3104
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3105
- #endif // _WIN32
3106
-
3310
+ backend_norm = llama_backend_offload;
3107
3311
  backend_output = llama_backend_offload_split;
3108
3312
  } else {
3109
3313
  backend_norm = GGML_BACKEND_CPU;
@@ -3174,14 +3378,7 @@ static void llm_load_tensors(
3174
3378
  ggml_backend_type backend_output;
3175
3379
 
3176
3380
  if (n_gpu_layers > int(n_layer)) {
3177
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3178
- // on Windows however this is detrimental unless everything is on the GPU
3179
- #ifndef _WIN32
3180
- backend_norm = llama_backend_offload;
3181
- #else
3182
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3183
- #endif // _WIN32
3184
-
3381
+ backend_norm = llama_backend_offload;
3185
3382
  backend_output = llama_backend_offload_split;
3186
3383
  } else {
3187
3384
  backend_norm = GGML_BACKEND_CPU;
@@ -3241,14 +3438,7 @@ static void llm_load_tensors(
3241
3438
  ggml_backend_type backend_output;
3242
3439
 
3243
3440
  if (n_gpu_layers > int(n_layer)) {
3244
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3245
- // on Windows however this is detrimental unless everything is on the GPU
3246
- #ifndef _WIN32
3247
- backend_norm = llama_backend_offload;
3248
- #else
3249
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3250
- #endif // _WIN32
3251
-
3441
+ backend_norm = llama_backend_offload;
3252
3442
  backend_output = llama_backend_offload_split;
3253
3443
  } else {
3254
3444
  backend_norm = GGML_BACKEND_CPU;
@@ -3305,6 +3495,64 @@ static void llm_load_tensors(
3305
3495
  }
3306
3496
  }
3307
3497
  } break;
3498
+ case LLM_ARCH_QWEN:
3499
+ {
3500
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3501
+ {
3502
+ ggml_backend_type backend_norm;
3503
+ ggml_backend_type backend_output;
3504
+
3505
+ if (n_gpu_layers > int(n_layer)) {
3506
+ backend_norm = llama_backend_offload;
3507
+ backend_output = llama_backend_offload_split;
3508
+ } else {
3509
+ backend_norm = GGML_BACKEND_CPU;
3510
+ backend_output = GGML_BACKEND_CPU;
3511
+ }
3512
+
3513
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3514
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3515
+
3516
+ if (backend_norm == GGML_BACKEND_GPU) {
3517
+ vram_weights += ggml_nbytes(model.output_norm);
3518
+ }
3519
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3520
+ vram_weights += ggml_nbytes(model.output);
3521
+ }
3522
+ }
3523
+
3524
+ const uint32_t n_ff = hparams.n_ff / 2;
3525
+
3526
+ const int i_gpu_start = n_layer - n_gpu_layers;
3527
+
3528
+ model.layers.resize(n_layer);
3529
+
3530
+ for (uint32_t i = 0; i < n_layer; ++i) {
3531
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3532
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3533
+
3534
+ auto & layer = model.layers[i];
3535
+
3536
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3537
+
3538
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
3539
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
3540
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3541
+
3542
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3543
+
3544
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3545
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3546
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3547
+
3548
+ if (backend == GGML_BACKEND_GPU) {
3549
+ vram_weights +=
3550
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3551
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
3552
+ ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3553
+ }
3554
+ }
3555
+ } break;
3308
3556
 
3309
3557
  default:
3310
3558
  throw std::runtime_error("unknown architecture");
@@ -3331,8 +3579,8 @@ static void llm_load_tensors(
3331
3579
  }
3332
3580
 
3333
3581
  #ifdef GGML_USE_CUBLAS
3334
- const int max_backend_supported_layers = hparams.n_layer + 3;
3335
- const int max_offloadable_layers = hparams.n_layer + 3;
3582
+ const int max_backend_supported_layers = hparams.n_layer + 1;
3583
+ const int max_offloadable_layers = hparams.n_layer + 1;
3336
3584
  #elif GGML_USE_CLBLAST
3337
3585
  const int max_backend_supported_layers = hparams.n_layer + 1;
3338
3586
  const int max_offloadable_layers = hparams.n_layer + 1;
@@ -3373,7 +3621,7 @@ static void llm_load_tensors(
3373
3621
 
3374
3622
  static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
3375
3623
  try {
3376
- llama_model_loader ml(fname, params.use_mmap);
3624
+ llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
3377
3625
 
3378
3626
  model.hparams.vocab_only = params.vocab_only;
3379
3627
 
@@ -3500,11 +3748,11 @@ static void llm_build_k_shift(
3500
3748
  struct ggml_tensor * tmp =
3501
3749
  // we rotate only the first n_rot dimensions
3502
3750
  ggml_rope_custom_inplace(ctx,
3503
- ggml_view_3d(ctx, kv.k,
3751
+ ggml_view_3d(ctx, kv.k_l[il],
3504
3752
  n_embd_head, n_head_kv, n_ctx,
3505
- ggml_element_size(kv.k)*n_embd_head,
3506
- ggml_element_size(kv.k)*n_embd_gqa,
3507
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
3753
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
3754
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
3755
+ 0),
3508
3756
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
3509
3757
  ext_factor, attn_factor, beta_fast, beta_slow);
3510
3758
  cb(tmp, "K_shifted", il);
@@ -3531,13 +3779,13 @@ static void llm_build_kv_store(
3531
3779
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
3532
3780
  cb(v_cur_t, "v_cur_t", il);
3533
3781
 
3534
- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa,
3535
- (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3782
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
3783
+ (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
3536
3784
  cb(k_cache_view, "k_cache_view", il);
3537
3785
 
3538
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa,
3539
- ( n_ctx)*ggml_element_size(kv.v),
3540
- (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v));
3786
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
3787
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
3788
+ (kv_head)*ggml_element_size(kv.v_l[il]));
3541
3789
  cb(v_cache_view, "v_cache_view", il);
3542
3790
 
3543
3791
  // important: storing RoPE-ed version of K in the KV cache!
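
Since every layer now owns its own cache tensor, the views written by llm_build_kv_store drop the il*n_ctx term from their offsets: the K view starts kv_head tokens into k_l[il], and the V view starts at column kv_head of the transposed v_l[il]. A standalone sketch of the byte offsets with hypothetical numbers, assuming f16 cache cells:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_embd_gqa = 4096; // K rows written per token
        const size_t kv_head    = 128;  // first free cell in this layer's cache
        const size_t elt        = 2;    // bytes per f16 element

        // old layout: one big tensor, offsets also skipped il*n_ctx tokens of earlier layers
        // new layout: per-layer tensors, offsets only skip the kv_head cells already in use
        const size_t k_offset = elt * n_embd_gqa * kv_head; // matches (type_size*n_embd_gqa)*kv_head
        const size_t v_offset = elt * kv_head;              // matches kv_head*element_size

        std::printf("k_offset = %zu bytes, v_offset = %zu bytes\n", k_offset, v_offset);
        return 0;
    }
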
@@ -3689,11 +3937,11 @@ static struct ggml_tensor * llm_build_kqv(
3689
3937
  cb(q, "q", il);
3690
3938
 
3691
3939
  struct ggml_tensor * k =
3692
- ggml_view_3d(ctx, kv.k,
3940
+ ggml_view_3d(ctx, kv.k_l[il],
3693
3941
  n_embd_head, n_kv, n_head_kv,
3694
- ggml_element_size(kv.k)*n_embd_gqa,
3695
- ggml_element_size(kv.k)*n_embd_head,
3696
- ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il);
3942
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
3943
+ ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
3944
+ 0);
3697
3945
  cb(k, "k", il);
3698
3946
 
3699
3947
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -3724,11 +3972,11 @@ static struct ggml_tensor * llm_build_kqv(
3724
3972
 
3725
3973
  // split cached v into n_head heads
3726
3974
  struct ggml_tensor * v =
3727
- ggml_view_3d(ctx, kv.v,
3975
+ ggml_view_3d(ctx, kv.v_l[il],
3728
3976
  n_kv, n_embd_head, n_head_kv,
3729
- ggml_element_size(kv.v)*n_ctx,
3730
- ggml_element_size(kv.v)*n_ctx*n_embd_head,
3731
- ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il);
3977
+ ggml_element_size(kv.v_l[il])*n_ctx,
3978
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
3979
+ 0);
3732
3980
  cb(v, "v", il);
3733
3981
 
3734
3982
  struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
@@ -3886,12 +4134,24 @@ struct llm_build_context {
3886
4134
  // compute Q and K and RoPE them
3887
4135
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3888
4136
  cb(Qcur, "Qcur", il);
4137
+ if (model.layers[il].bq) {
4138
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
4139
+ cb(Qcur, "Qcur", il);
4140
+ }
3889
4141
 
3890
4142
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3891
4143
  cb(Kcur, "Kcur", il);
4144
+ if (model.layers[il].bk) {
4145
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
4146
+ cb(Kcur, "Kcur", il);
4147
+ }
3892
4148
 
3893
4149
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3894
4150
  cb(Vcur, "Vcur", il);
4151
+ if (model.layers[il].bv) {
4152
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
4153
+ cb(Vcur, "Vcur", il);
4154
+ }
3895
4155
 
3896
4156
  Qcur = ggml_rope_custom(
3897
4157
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3910,7 +4170,7 @@ struct llm_build_context {
3910
4170
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3911
4171
 
3912
4172
  cur = llm_build_kqv(ctx0, hparams, kv_self,
3913
- model.layers[il].wo, NULL,
4173
+ model.layers[il].wo, model.layers[il].bo,
3914
4174
  Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
3915
4175
  cb(cur, "kqv_out", il);
3916
4176
  }
@@ -4308,6 +4568,7 @@ struct llm_build_context {
4308
4568
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4309
4569
  cb(inpL, "imp_embd", -1);
4310
4570
 
4571
+ // inp_pos - contains the positions
4311
4572
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4312
4573
  cb(inp_pos, "inp_pos", -1);
4313
4574
 
@@ -4315,6 +4576,7 @@ struct llm_build_context {
4315
4576
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4316
4577
  cb(KQ_scale, "KQ_scale", -1);
4317
4578
 
4579
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4318
4580
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4319
4581
  cb(KQ_mask, "KQ_mask", -1);
4320
4582
 
@@ -4903,6 +5165,121 @@ struct llm_build_context {
4903
5165
 
4904
5166
  return gf;
4905
5167
  }
5168
+
5169
+ struct ggml_cgraph * build_qwen() {
5170
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5171
+
5172
+ struct ggml_tensor * cur;
5173
+ struct ggml_tensor * inpL;
5174
+
5175
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5176
+ cb(inpL, "inp_embd", -1);
5177
+
5178
+ // inp_pos - contains the positions
5179
+ struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5180
+ cb(inp_pos, "inp_pos", -1);
5181
+
5182
+ // KQ_scale
5183
+ struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5184
+ cb(KQ_scale, "KQ_scale", -1);
5185
+
5186
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5187
+ struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5188
+ cb(KQ_mask, "KQ_mask", -1);
5189
+
5190
+ // shift the entire K-cache if needed
5191
+ if (do_rope_shift) {
5192
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5193
+ }
5194
+
5195
+ for (int il = 0; il < n_layer; ++il) {
5196
+ struct ggml_tensor * inpSA = inpL;
5197
+
5198
+ cur = llm_build_norm(ctx0, inpL, hparams,
5199
+ model.layers[il].attn_norm, NULL,
5200
+ LLM_NORM_RMS, cb, il);
5201
+ cb(cur, "attn_norm", il);
5202
+
5203
+ // self-attention
5204
+ {
5205
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5206
+ cb(cur, "wqkv", il);
5207
+
5208
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5209
+ cb(cur, "bqkv", il);
5210
+
5211
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5212
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5213
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
5214
+
5215
+ cb(Qcur, "Qcur", il);
5216
+ cb(Kcur, "Kcur", il);
5217
+ cb(Vcur, "Vcur", il);
5218
+
5219
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5220
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5221
+
5222
+ // using mode = 2 for neox mode
5223
+ Qcur = ggml_rope_custom(
5224
+ ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5225
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5226
+ );
5227
+ cb(Qcur, "Qcur", il);
5228
+
5229
+ Kcur = ggml_rope_custom(
5230
+ ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5231
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5232
+ );
5233
+ cb(Kcur, "Kcur", il);
5234
+
5235
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5236
+
5237
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
5238
+ model.layers[il].wo, NULL,
5239
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5240
+ cb(cur, "kqv_out", il);
5241
+ }
5242
+
5243
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5244
+ cb(ffn_inp, "ffn_inp", il);
5245
+
5246
+ // feed-forward forward
5247
+ {
5248
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
5249
+ model.layers[il].ffn_norm, NULL,
5250
+ LLM_NORM_RMS, cb, il);
5251
+ cb(cur, "ffn_norm", il);
5252
+
5253
+ cur = llm_build_ffn(ctx0, cur,
5254
+ model.layers[il].ffn_up, NULL,
5255
+ model.layers[il].ffn_gate, NULL,
5256
+ model.layers[il].ffn_down, NULL,
5257
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5258
+ cb(cur, "ffn_out", il);
5259
+ }
5260
+
5261
+ cur = ggml_add(ctx0, cur, ffn_inp);
5262
+ cb(cur, "l_out", il);
5263
+
5264
+ // input for next layer
5265
+ inpL = cur;
5266
+ }
5267
+
5268
+ cur = inpL;
5269
+
5270
+ cur = llm_build_norm(ctx0, cur, hparams,
5271
+ model.output_norm, NULL,
5272
+ LLM_NORM_RMS, cb, -1);
5273
+ cb(cur, "result_norm", -1);
5274
+
5275
+ // lm_head
5276
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5277
+ cb(cur, "result_output", -1);
5278
+
5279
+ ggml_build_forward_expand(gf, cur);
5280
+
5281
+ return gf;
5282
+ }
4906
5283
  };
4907
5284
 
4908
5285
  //
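
build_qwen runs one fused QKV projection (wqkv plus bqkv) and then slices the result into Q, K and V with three 2-D views at float offsets 0, n_embd and 2*n_embd within each token's row, before the usual NEOX-style RoPE (mode 2), kv_store and kqv steps. A small standalone sketch of that slicing on a plain float buffer, purely to illustrate the layout the view offsets assume:

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical tiny sizes; Qwen-7B's real n_embd would be 4096
        const int n_embd   = 4;
        const int n_tokens = 2;

        // one row per token, laid out as [Q | K | V] = 3*n_embd floats,
        // matching the 0 / n_embd / 2*n_embd float offsets in build_qwen()
        std::vector<float> qkv(3 * (size_t) n_embd * n_tokens);
        for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

        for (int t = 0; t < n_tokens; ++t) {
            const float * row = qkv.data() + (size_t) t * 3 * n_embd;
            const float * Q = row + 0 * n_embd;
            const float * K = row + 1 * n_embd;
            const float * V = row + 2 * n_embd;
            std::printf("token %d: Q[0]=%.0f K[0]=%.0f V[0]=%.0f\n", t, Q[0], K[0], V[0]);
        }
        return 0;
    }
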
@@ -4913,8 +5290,8 @@ struct llm_build_context {
4913
5290
  enum llm_offload_func_e {
4914
5291
  OFFLOAD_FUNC_NOP,
4915
5292
  OFFLOAD_FUNC,
4916
- OFFLOAD_FUNC_KQ,
4917
- OFFLOAD_FUNC_V,
5293
+ OFFLOAD_FUNC_FRC, // force offload
5294
+ OFFLOAD_FUNC_KQV,
4918
5295
  OFFLOAD_FUNC_NR,
4919
5296
  OFFLOAD_FUNC_EMB,
4920
5297
  OFFLOAD_FUNC_OUT,
@@ -5000,11 +5377,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5000
5377
  //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
5001
5378
  { "pos_embd", OFFLOAD_FUNC_NR },
5002
5379
 
5003
- { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope)
5004
- { "KQ_scale", OFFLOAD_FUNC_KQ },
5005
- { "KQ_mask", OFFLOAD_FUNC_KQ },
5006
- { "K_shift", OFFLOAD_FUNC_KQ },
5007
- { "K_shifted", OFFLOAD_FUNC_KQ },
5380
+ { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
5381
+ { "KQ_scale", OFFLOAD_FUNC_FRC },
5382
+ { "KQ_mask", OFFLOAD_FUNC_FRC },
5383
+ { "K_shift", OFFLOAD_FUNC_FRC },
5384
+
5385
+ { "K_shifted", OFFLOAD_FUNC },
5008
5386
 
5009
5387
  { "inp_norm", OFFLOAD_FUNC_NR },
5010
5388
  { "inp_norm_w", OFFLOAD_FUNC_NR },
@@ -5017,38 +5395,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5017
5395
  { "attn_norm", OFFLOAD_FUNC },
5018
5396
  { "attn_norm_2", OFFLOAD_FUNC },
5019
5397
 
5020
- { "wqkv", OFFLOAD_FUNC_KQ },
5021
- { "bqkv", OFFLOAD_FUNC_KQ },
5022
- { "wqkv_clamped", OFFLOAD_FUNC_KQ },
5023
-
5024
- { "tmpk", OFFLOAD_FUNC_KQ },
5025
- { "tmpq", OFFLOAD_FUNC_KQ },
5026
- { "tmpv", OFFLOAD_FUNC_V },
5027
- { "Kcur", OFFLOAD_FUNC_KQ },
5028
- { "Qcur", OFFLOAD_FUNC_KQ },
5029
- { "Vcur", OFFLOAD_FUNC_V },
5030
-
5031
- { "krot", OFFLOAD_FUNC_KQ },
5032
- { "qrot", OFFLOAD_FUNC_KQ },
5033
- { "kpass", OFFLOAD_FUNC_KQ },
5034
- { "qpass", OFFLOAD_FUNC_KQ },
5035
- { "krotated", OFFLOAD_FUNC_KQ },
5036
- { "qrotated", OFFLOAD_FUNC_KQ },
5037
-
5038
- { "q", OFFLOAD_FUNC_KQ },
5039
- { "k", OFFLOAD_FUNC_KQ },
5040
- { "kq", OFFLOAD_FUNC_KQ },
5041
- { "kq_scaled", OFFLOAD_FUNC_KQ },
5042
- { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
5043
- { "kq_masked", OFFLOAD_FUNC_KQ },
5044
- { "kq_soft_max", OFFLOAD_FUNC_V },
5045
- { "kq_soft_max_ext", OFFLOAD_FUNC_V },
5046
- { "v", OFFLOAD_FUNC_V },
5047
- { "kqv", OFFLOAD_FUNC_V },
5048
- { "kqv_merged", OFFLOAD_FUNC_V },
5049
- { "kqv_merged_cont", OFFLOAD_FUNC_V },
5050
- { "kqv_wo", OFFLOAD_FUNC_V },
5051
- { "kqv_out", OFFLOAD_FUNC_V },
5398
+ { "wqkv", OFFLOAD_FUNC_KQV },
5399
+ { "bqkv", OFFLOAD_FUNC_KQV },
5400
+ { "wqkv_clamped", OFFLOAD_FUNC_KQV },
5401
+
5402
+ { "tmpk", OFFLOAD_FUNC_KQV },
5403
+ { "tmpq", OFFLOAD_FUNC_KQV },
5404
+ { "tmpv", OFFLOAD_FUNC_KQV },
5405
+ { "Kcur", OFFLOAD_FUNC_KQV },
5406
+ { "Qcur", OFFLOAD_FUNC_KQV },
5407
+ { "Vcur", OFFLOAD_FUNC_KQV },
5408
+
5409
+ { "krot", OFFLOAD_FUNC_KQV },
5410
+ { "qrot", OFFLOAD_FUNC_KQV },
5411
+ { "kpass", OFFLOAD_FUNC_KQV },
5412
+ { "qpass", OFFLOAD_FUNC_KQV },
5413
+ { "krotated", OFFLOAD_FUNC_KQV },
5414
+ { "qrotated", OFFLOAD_FUNC_KQV },
5415
+
5416
+ { "q", OFFLOAD_FUNC_KQV },
5417
+ { "k", OFFLOAD_FUNC_KQV },
5418
+ { "kq", OFFLOAD_FUNC_KQV },
5419
+ { "kq_scaled", OFFLOAD_FUNC_KQV },
5420
+ { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
5421
+ { "kq_masked", OFFLOAD_FUNC_KQV },
5422
+ { "kq_soft_max", OFFLOAD_FUNC_KQV },
5423
+ { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
5424
+ { "v", OFFLOAD_FUNC_KQV },
5425
+ { "kqv", OFFLOAD_FUNC_KQV },
5426
+ { "kqv_merged", OFFLOAD_FUNC_KQV },
5427
+ { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
5428
+ { "kqv_wo", OFFLOAD_FUNC_KQV },
5429
+ { "kqv_out", OFFLOAD_FUNC_KQV },
5052
5430
 
5053
5431
  { "ffn_inp", OFFLOAD_FUNC },
5054
5432
  { "ffn_norm", OFFLOAD_FUNC },
@@ -5240,15 +5618,15 @@ static struct ggml_cgraph * llama_build_graph(
5240
5618
  { OFFLOAD_FUNC_NOP, "CPU" },
5241
5619
  { OFFLOAD_FUNC_OUT, "CPU" },
5242
5620
  #ifdef GGML_USE_CUBLAS
5243
- { OFFLOAD_FUNC, "GPU (CUDA)" },
5244
- { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" },
5245
- { OFFLOAD_FUNC_V, "GPU (CUDA) V" },
5246
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
5621
+ { OFFLOAD_FUNC, "GPU (CUDA)" },
5622
+ { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
5623
+ { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
5624
+ { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
5247
5625
  { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
5248
5626
  #else
5249
5627
  { OFFLOAD_FUNC, "CPU" },
5250
- { OFFLOAD_FUNC_KQ, "CPU" },
5251
- { OFFLOAD_FUNC_V, "CPU" },
5628
+ { OFFLOAD_FUNC_FRC, "CPU" },
5629
+ { OFFLOAD_FUNC_KQV, "CPU" },
5252
5630
  { OFFLOAD_FUNC_NR, "CPU" },
5253
5631
  { OFFLOAD_FUNC_EMB, "CPU" },
5254
5632
  #endif // GGML_USE_CUBLAS
@@ -5281,18 +5659,23 @@ static struct ggml_cgraph * llama_build_graph(
5281
5659
  }
5282
5660
  }
5283
5661
  break;
5284
- case OFFLOAD_FUNC_NR:
5285
- if (n_gpu_layers <= n_layer + 0) {
5662
+ case OFFLOAD_FUNC_FRC:
5663
+ if (!lctx.cparams.offload_kqv) {
5286
5664
  func_e = OFFLOAD_FUNC_NOP;
5287
- }
5288
- break;
5289
- case OFFLOAD_FUNC_V:
5290
- if (n_gpu_layers <= n_layer + 1) {
5665
+ } break;
5666
+ case OFFLOAD_FUNC_KQV:
5667
+ if (!lctx.cparams.offload_kqv) {
5291
5668
  func_e = OFFLOAD_FUNC_NOP;
5669
+ } else {
5670
+ if (n_gpu_layers < n_layer) {
5671
+ if (il < i_gpu_start) {
5672
+ func_e = OFFLOAD_FUNC_NOP;
5673
+ }
5674
+ }
5292
5675
  }
5293
5676
  break;
5294
- case OFFLOAD_FUNC_KQ:
5295
- if (n_gpu_layers <= n_layer + 2) {
5677
+ case OFFLOAD_FUNC_NR:
5678
+ if (n_gpu_layers <= n_layer + 0) {
5296
5679
  func_e = OFFLOAD_FUNC_NOP;
5297
5680
  }
5298
5681
  break;
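
Taken together, the hunks above replace the old OFFLOAD_FUNC_KQ / OFFLOAD_FUNC_V classes (gated on n_gpu_layers exceeding n_layer by 1 or 2) with OFFLOAD_FUNC_FRC and OFFLOAD_FUNC_KQV, both gated on the new offload_kqv context flag. A minimal sketch of the resulting policy, paraphrased from the code above — the helper name resolve_offload is invented for illustration and is not part of the library:

    // C++ sketch: how a tensor's offload class is resolved after this change.
    static llm_offload_func_e resolve_offload(llm_offload_func_e func_e,
                                              bool offload_kqv,
                                              int n_gpu_layers, int n_layer,
                                              int il, int i_gpu_start) {
        switch (func_e) {
            case OFFLOAD_FUNC_FRC:  // "force offload" inputs: inp_pos, KQ_scale, KQ_mask, K_shift
                return offload_kqv ? OFFLOAD_FUNC_FRC : OFFLOAD_FUNC_NOP;
            case OFFLOAD_FUNC_KQV:  // attention tensors: wqkv, Qcur/Kcur/Vcur, kq, kqv_out, ...
                if (!offload_kqv)                               return OFFLOAD_FUNC_NOP;
                if (n_gpu_layers < n_layer && il < i_gpu_start) return OFFLOAD_FUNC_NOP;
                return OFFLOAD_FUNC_KQV;
            default:
                return func_e;      // the remaining classes keep their previous gating
        }
    }
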
@@ -5317,8 +5700,8 @@ static struct ggml_cgraph * llama_build_graph(
5317
5700
  case OFFLOAD_FUNC_NOP:
5318
5701
  case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
5319
5702
  case OFFLOAD_FUNC:
5320
- case OFFLOAD_FUNC_KQ:
5321
- case OFFLOAD_FUNC_V:
5703
+ case OFFLOAD_FUNC_KQV:
5704
+ case OFFLOAD_FUNC_FRC:
5322
5705
  case OFFLOAD_FUNC_NR:
5323
5706
  case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
5324
5707
  default: GGML_ASSERT(false);
@@ -5377,6 +5760,10 @@ static struct ggml_cgraph * llama_build_graph(
5377
5760
  {
5378
5761
  result = llm.build_stablelm();
5379
5762
  } break;
5763
+ case LLM_ARCH_QWEN:
5764
+ {
5765
+ result = llm.build_qwen();
5766
+ } break;
5380
5767
  default:
5381
5768
  GGML_ASSERT(false);
5382
5769
  }
@@ -5499,8 +5886,8 @@ static int llama_decode_internal(
5499
5886
  // a heuristic, to avoid attending the full cache if it is not yet utilized
5500
5887
  // after enough generations, the benefit from this heuristic disappears
5501
5888
  // if we start defragmenting the cache, the benefit from this will be more important
5502
- //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
5503
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
5889
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
5890
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
5504
5891
 
5505
5892
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
5506
5893
 
@@ -5551,7 +5938,7 @@ static int llama_decode_internal(
5551
5938
  n_threads = std::min(4, n_threads);
5552
5939
  }
5553
5940
 
5554
- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5941
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
5555
5942
  if (ggml_cpu_has_cublas() && fully_offloaded) {
5556
5943
  n_threads = 1;
5557
5944
  }
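
Two related adjustments in llama_decode_internal appear in the hunks above. First, kv_self.n is now padded to a multiple of 32 and clamped to n_ctx instead of using the raw cell count; assuming GGML_PAD(x, 32) rounds x up to the next multiple of 32 (which is how it is used here), a batch with 100 KV cells in use gives GGML_PAD(100, 32) = 128, so kv_self.n = min(n_ctx, max(32, 128)) = 128 for any context of at least 128 tokens, where previously it would have been 100. Second, the "fully offloaded" threshold drops from n_layer + 3 to n_layer + 1: with the KQ/V offload classes gone, only the output layer counts on top of the repeating layers.
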
@@ -6410,14 +6797,13 @@ struct llama_grammar_candidate {
6410
6797
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
6411
6798
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
6412
6799
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6413
- const char * src,
6414
- size_t n_src,
6800
+ const std::string & src,
6415
6801
  llama_partial_utf8 partial_start) {
6416
6802
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
6417
- const char * pos = src;
6803
+ const char * pos = src.c_str();
6418
6804
  std::vector<uint32_t> code_points;
6419
6805
  // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
6420
- code_points.reserve(n_src + 1);
6806
+ code_points.reserve(src.size() + 1);
6421
6807
  uint32_t value = partial_start.value;
6422
6808
  int n_remain = partial_start.n_remain;
6423
6809
 
@@ -6468,13 +6854,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6468
6854
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
6469
6855
  }
6470
6856
 
6471
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
6472
- std::string src,
6473
- llama_partial_utf8 partial_start
6474
- ) {
6475
- return decode_utf8(src.c_str(), src.size(), partial_start);
6476
- }
6477
-
6478
6857
  // returns true iff pos points to the end of one of the definitions of a rule
6479
6858
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
6480
6859
  switch (pos->type) {
@@ -7113,11 +7492,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
7113
7492
  const llama_token eos = llama_token_eos(&ctx->model);
7114
7493
 
7115
7494
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
7495
+ candidates_decoded.reserve(candidates->size);
7116
7496
  std::vector<llama_grammar_candidate> candidates_grammar;
7497
+ candidates_grammar.reserve(candidates->size);
7117
7498
 
7118
7499
  for (size_t i = 0; i < candidates->size; ++i) {
7119
7500
  const llama_token id = candidates->data[i].id;
7120
- const std::string piece = llama_token_to_piece(ctx, id);
7501
+ const std::string & piece = ctx->model.vocab.id_to_token[id].text;
7121
7502
  if (id == eos) {
7122
7503
  if (!allow_eos) {
7123
7504
  candidates->data[i].logit = -INFINITY;
@@ -7329,7 +7710,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
7329
7710
  GGML_ASSERT(false);
7330
7711
  }
7331
7712
 
7332
- const std::string piece = llama_token_to_piece(ctx, token);
7713
+ const std::string & piece = ctx->model.vocab.id_to_token[token].text;
7333
7714
 
7334
7715
  // Note terminating 0 in decoded string
7335
7716
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7650,18 +8031,21 @@ static void llama_convert_tensor_internal(
7650
8031
  return;
7651
8032
  }
7652
8033
 
7653
- auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
7654
- auto block_size_bytes = ggml_type_size(tensor->type);
8034
+ size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
8035
+ size_t block_size_bytes = ggml_type_size(tensor->type);
7655
8036
 
7656
8037
  GGML_ASSERT(nelements % block_size == 0);
7657
- auto nblocks = nelements / block_size;
7658
- auto blocks_per_thread = nblocks / nthread;
7659
- auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
8038
+ size_t nblocks = nelements / block_size;
8039
+ size_t blocks_per_thread = nblocks / nthread;
8040
+ size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
8041
+
8042
+ size_t in_buff_offs = 0;
8043
+ size_t out_buff_offs = 0;
7660
8044
 
7661
- for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
7662
- auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
7663
- auto thr_elems = thr_blocks * block_size; // number of elements for this thread
7664
- auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
8045
+ for (int tnum = 0; tnum < nthread; tnum++) {
8046
+ size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
8047
+ size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
8048
+ size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
7665
8049
 
7666
8050
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
7667
8051
  if (typ == GGML_TYPE_F16) {
@@ -7831,7 +8215,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7831
8215
  constexpr bool use_mmap = false;
7832
8216
  #endif
7833
8217
 
7834
- llama_model_loader ml(fname_inp, use_mmap);
8218
+ llama_model_loader ml(fname_inp, use_mmap, NULL);
7835
8219
  if (ml.use_mmap) {
7836
8220
  ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
7837
8221
  }
@@ -8127,7 +8511,7 @@ static int llama_apply_lora_from_file_internal(
8127
8511
  std::vector<uint8_t> base_buf;
8128
8512
  if (path_base_model) {
8129
8513
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
8130
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
8514
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
8131
8515
 
8132
8516
  size_t ctx_size;
8133
8517
  size_t mmapped_size;
@@ -8355,6 +8739,7 @@ struct llama_model_params llama_model_default_params() {
8355
8739
  /*.tensor_split =*/ nullptr,
8356
8740
  /*.progress_callback =*/ nullptr,
8357
8741
  /*.progress_callback_user_data =*/ nullptr,
8742
+ /*.kv_overrides =*/ nullptr,
8358
8743
  /*.vocab_only =*/ false,
8359
8744
  /*.use_mmap =*/ true,
8360
8745
  /*.use_mlock =*/ false,
@@ -8382,10 +8767,12 @@ struct llama_context_params llama_context_default_params() {
8382
8767
  /*.yarn_beta_fast =*/ 32.0f,
8383
8768
  /*.yarn_beta_slow =*/ 1.0f,
8384
8769
  /*.yarn_orig_ctx =*/ 0,
8770
+ /*.type_k =*/ GGML_TYPE_F16,
8771
+ /*.type_v =*/ GGML_TYPE_F16,
8385
8772
  /*.mul_mat_q =*/ true,
8386
- /*.f16_kv =*/ true,
8387
8773
  /*.logits_all =*/ false,
8388
8774
  /*.embedding =*/ false,
8775
+ /*.offload_kqv =*/ true,
8389
8776
  };
8390
8777
 
8391
8778
  return result;
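
The removed f16_kv boolean is superseded by explicit type_k / type_v cache types, and the new offload_kqv flag controls whether the KV cache and attention tensors are offloaded. A minimal caller-side sketch, assuming a model already loaded into model (for example via llama_load_model_from_file) and types for which the block-size asserts introduced later in this diff hold:

    #include "llama.h"

    struct llama_context_params cparams = llama_context_default_params();
    cparams.type_k      = GGML_TYPE_Q8_0;  // quantized K cache (previously only implied by f16_kv)
    cparams.type_v      = GGML_TYPE_F16;   // V cache stays in F16
    cparams.offload_kqv = false;           // new in 0.10.0: keep KV cache / attention work on the CPU

    struct llama_context * lctx = llama_new_context_with_model(model, cparams);
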
@@ -8502,6 +8889,7 @@ struct llama_context * llama_new_context_with_model(
8502
8889
  cparams.yarn_beta_fast = params.yarn_beta_fast;
8503
8890
  cparams.yarn_beta_slow = params.yarn_beta_slow;
8504
8891
  cparams.mul_mat_q = params.mul_mat_q;
8892
+ cparams.offload_kqv = params.offload_kqv;
8505
8893
 
8506
8894
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
8507
8895
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8535,19 +8923,36 @@ struct llama_context * llama_new_context_with_model(
8535
8923
  ctx->rng = std::mt19937(params.seed);
8536
8924
  ctx->logits_all = params.logits_all;
8537
8925
 
8538
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
8926
+ const ggml_type type_k = params.type_k;
8927
+ const ggml_type type_v = params.type_v;
8928
+
8929
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
8930
+ GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
8539
8931
 
8540
8932
  // reserve memory for context buffers
8541
8933
  if (!hparams.vocab_only) {
8542
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
8934
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
8543
8935
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
8544
8936
  llama_free(ctx);
8545
8937
  return nullptr;
8546
8938
  }
8547
8939
 
8548
8940
  {
8549
- const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
8550
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
8941
+ size_t memory_size_k = 0;
8942
+ size_t memory_size_v = 0;
8943
+
8944
+ for (auto & k : ctx->kv_self.k_l) {
8945
+ memory_size_k += ggml_nbytes(k);
8946
+ }
8947
+
8948
+ for (auto & v : ctx->kv_self.v_l) {
8949
+ memory_size_v += ggml_nbytes(v);
8950
+ }
8951
+
8952
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
8953
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
8954
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
8955
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
8551
8956
  }
8552
8957
 
8553
8958
  // resized during inference
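
The log line now reports K and V separately because the cache is stored per layer (k_l / v_l) and the two halves may use different types. As a rough worked example, assuming a 7B LLaMA-style model without grouped-query attention (n_layer = 32, n_embd = 4096, n_ctx = 4096) with both caches in F16: each layer stores n_ctx × n_embd two-byte elements for K and the same for V, i.e. 4096 × 4096 × 2 B = 32 MiB per tensor, so the log would report about 1024 MiB for K, 1024 MiB for V, 2048 MiB in total — the same footprint as before, just accounted per layer and per type.
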
@@ -8618,8 +9023,12 @@ struct llama_context * llama_new_context_with_model(
8618
9023
  }
8619
9024
 
8620
9025
  size_t kv_vram_size = 0;
8621
- add_tensor(ctx->kv_self.k, kv_vram_size);
8622
- add_tensor(ctx->kv_self.v, kv_vram_size);
9026
+ for (auto & k : ctx->kv_self.k_l) {
9027
+ add_tensor(k, kv_vram_size);
9028
+ }
9029
+ for (auto & v : ctx->kv_self.v_l) {
9030
+ add_tensor(v, kv_vram_size);
9031
+ }
8623
9032
 
8624
9033
  size_t ctx_vram_size = alloc_size + kv_vram_size;
8625
9034
  size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9089,37 +9498,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
9089
9498
  data_ctx->write(&kv_used, sizeof(kv_used));
9090
9499
 
9091
9500
  if (kv_buf_size) {
9092
- const size_t elt_size = ggml_element_size(kv_self.k);
9501
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
9093
9502
 
9094
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9503
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9095
9504
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
9096
9505
 
9097
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
9098
- std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
9099
- kout3d->data = kout3d_data.data();
9506
+ std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
9507
+ std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
9508
+
9509
+ for (int il = 0; il < (int) n_layer; ++il) {
9510
+ ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9511
+ kout2d_data[il].resize(ggml_nbytes(kout2d));
9512
+ kout2d->data = kout2d_data[il].data();
9100
9513
 
9101
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
9102
- std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
9103
- vout3d->data = vout3d_data.data();
9514
+ ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9515
+ vout2d_data[il].resize(ggml_nbytes(vout2d));
9516
+ vout2d->data = vout2d_data[il].data();
9104
9517
 
9105
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
9106
- n_embd, kv_head, n_layer,
9107
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
9518
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9519
+ n_embd, kv_head,
9520
+ elt_size*n_embd, 0);
9108
9521
 
9109
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
9110
- kv_head, n_embd, n_layer,
9111
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
9522
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9523
+ kv_head, n_embd,
9524
+ elt_size*n_ctx, 0);
9525
+
9526
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
9527
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
9528
+ }
9112
9529
 
9113
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
9114
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
9115
9530
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
9116
9531
 
9117
9532
  ggml_free(cpy_ctx);
9118
9533
 
9119
- // our data is now in the kout3d_data and vout3d_data buffers
9534
+ // our data is now in the kout2d_data and vout2d_data buffers
9120
9535
  // write them to file
9121
- data_ctx->write(kout3d_data.data(), kout3d_data.size());
9122
- data_ctx->write(vout3d_data.data(), vout3d_data.size());
9536
+ for (uint32_t il = 0; il < n_layer; ++il) {
9537
+ data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
9538
+ data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
9539
+ }
9123
9540
  }
9124
9541
 
9125
9542
  for (uint32_t i = 0; i < kv_size; ++i) {
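
Session serialization now walks the per-layer k_l / v_l tensors and copies each layer through a 2-D view instead of a single 3-D copy, but the public entry points are unchanged. A minimal round-trip sketch using the public API (error handling omitted; ctx is assumed to be an existing llama_context, and the restoring context must be created with the same model and cache parameters):

    #include <vector>
    #include "llama.h"

    // Snapshot the full context state (per-layer KV cache, RNG, logits, embeddings)
    // into a caller-owned buffer, then restore it later.
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    const size_t n_written = llama_copy_state_data(ctx, state.data());  // serialize
    // ... later ...
    const size_t n_read    = llama_set_state_data(ctx, state.data());   // restore
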
@@ -9219,29 +9636,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
9219
9636
  if (kv_buf_size) {
9220
9637
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
9221
9638
 
9222
- const size_t elt_size = ggml_element_size(kv_self.k);
9639
+ const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
9223
9640
 
9224
- ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9641
+ ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9225
9642
  ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
9226
9643
 
9227
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
9228
- kin3d->data = (void *) inp;
9229
- inp += ggml_nbytes(kin3d);
9644
+ for (int il = 0; il < n_layer; ++il) {
9645
+ ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9646
+ kin2d->data = (void *) inp;
9647
+ inp += ggml_nbytes(kin2d);
9230
9648
 
9231
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
9232
- vin3d->data = (void *) inp;
9233
- inp += ggml_nbytes(vin3d);
9649
+ ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9650
+ vin2d->data = (void *) inp;
9651
+ inp += ggml_nbytes(vin2d);
9234
9652
 
9235
- ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
9236
- n_embd, kv_head, n_layer,
9237
- elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
9653
+ ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9654
+ n_embd, kv_head,
9655
+ elt_size*n_embd, 0);
9238
9656
 
9239
- ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
9240
- kv_head, n_embd, n_layer,
9241
- elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
9657
+ ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9658
+ kv_head, n_embd,
9659
+ elt_size*n_ctx, 0);
9660
+
9661
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
9662
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
9663
+ }
9242
9664
 
9243
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
9244
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
9245
9665
  ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
9246
9666
 
9247
9667
  ggml_free(cpy_ctx);