llama_cpp 0.5.1 → 0.5.3

@@ -1,8 +1,4 @@
1
- // Defines fileno on msys:
2
- #ifndef _GNU_SOURCE
3
- #define _GNU_SOURCE
4
- #endif
5
-
1
+ #define LLAMA_API_INTERNAL
6
2
  #include "llama.h"
7
3
 
8
4
  #include "ggml.h"
@@ -113,7 +109,7 @@ static size_t utf8_len(char src) {
113
109
  return lookup[highbits];
114
110
  }
115
111
 
116
- void replace_all(std::string & s, const std::string & search, const std::string & replace) {
112
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
117
113
  std::string result;
118
114
  for (size_t pos = 0; ; pos += search.length()) {
119
115
  auto new_pos = s.find(search, pos);
@@ -160,20 +156,24 @@ static std::string format(const char * fmt, ...) {
160
156
  enum llm_arch {
161
157
  LLM_ARCH_LLAMA,
162
158
  LLM_ARCH_FALCON,
159
+ LLM_ARCH_BAICHUAN,
163
160
  LLM_ARCH_GPT2,
164
161
  LLM_ARCH_GPTJ,
165
162
  LLM_ARCH_GPTNEOX,
166
163
  LLM_ARCH_MPT,
164
+ LLM_ARCH_STARCODER,
167
165
  LLM_ARCH_UNKNOWN,
168
166
  };
169
167
 
170
168
  static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
171
- { LLM_ARCH_LLAMA, "llama" },
172
- { LLM_ARCH_FALCON, "falcon" },
173
- { LLM_ARCH_GPT2, "gpt2" },
174
- { LLM_ARCH_GPTJ, "gptj" },
175
- { LLM_ARCH_GPTNEOX, "gptneox" },
176
- { LLM_ARCH_MPT, "mpt" },
169
+ { LLM_ARCH_LLAMA, "llama" },
170
+ { LLM_ARCH_FALCON, "falcon" },
171
+ { LLM_ARCH_GPT2, "gpt2" },
172
+ { LLM_ARCH_GPTJ, "gptj" },
173
+ { LLM_ARCH_GPTNEOX, "gptneox" },
174
+ { LLM_ARCH_MPT, "mpt" },
175
+ { LLM_ARCH_BAICHUAN, "baichuan" },
176
+ { LLM_ARCH_STARCODER, "starcoder" },
177
177
  };
178
178
 
179
179
  enum llm_kv {
@@ -314,6 +314,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
314
314
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
315
315
  },
316
316
  },
317
+ {
318
+ LLM_ARCH_BAICHUAN,
319
+ {
320
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
321
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
322
+ { LLM_TENSOR_OUTPUT, "output" },
323
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
324
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
325
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
326
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
327
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
328
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
329
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
330
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
331
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
332
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
333
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
334
+ },
335
+ },
317
336
  {
318
337
  LLM_ARCH_FALCON,
319
338
  {
@@ -360,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
360
379
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
361
380
  },
362
381
  },
382
+ {
383
+ LLM_ARCH_STARCODER,
384
+ {
385
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
386
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
387
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
388
+ { LLM_TENSOR_OUTPUT, "output" },
389
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
390
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
391
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
392
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
393
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
394
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
395
+ },
396
+ },
363
397
  {
364
398
  LLM_ARCH_UNKNOWN,
365
399
  {
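
The per-layer entries above are printf-style patterns ("blk.%d.attn_q", "blk.%d.ffn_up", ...). Below is a minimal standalone sketch, not part of this diff, of how such a pattern plus a "weight"/"bias" suffix becomes the GGUF tensor name the loader looks up; tensor_name here is a hypothetical stand-in for the LLM_TN/tn() helper used later in the file.

#include <cstdio>
#include <string>

// hypothetical helper: expand "blk.%d.attn_q" with the layer index, then append the suffix
static std::string tensor_name(const char * pattern, const char * suffix, int layer) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, layer);
    return std::string(buf) + "." + suffix;
}

int main() {
    std::printf("%s\n", tensor_name("blk.%d.attn_q",   "weight", 3).c_str()); // blk.3.attn_q.weight
    std::printf("%s\n", tensor_name("blk.%d.ffn_norm", "bias",   0).c_str()); // blk.0.ffn_norm.bias
    return 0;
}
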
@@ -658,9 +692,7 @@ struct llama_mmap {
658
692
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
659
693
  if (prefetch) {
660
694
  // Advise the kernel to preload the mapped memory
661
-
662
695
  WIN32_MEMORY_RANGE_ENTRY range;
663
-
664
696
  range.VirtualAddress = addr;
665
697
  range.NumberOfBytes = (SIZE_T)size;
666
698
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -876,9 +908,11 @@ static llama_state g_state;
876
908
  // available llama models
877
909
  enum e_model {
878
910
  MODEL_UNKNOWN,
911
+ MODEL_1B,
879
912
  MODEL_3B,
880
913
  MODEL_7B,
881
914
  MODEL_13B,
915
+ MODEL_15B,
882
916
  MODEL_30B,
883
917
  MODEL_34B,
884
918
  MODEL_40B,
@@ -888,24 +922,24 @@ enum e_model {
888
922
 
889
923
  static const size_t kB = 1024;
890
924
  static const size_t MB = kB*kB;
925
+ static const size_t GB = kB*kB*kB;
891
926
 
892
- // default hparams (LLaMA 7B)
893
927
  struct llama_hparams {
894
- uint32_t n_vocab = 32000;
895
- uint32_t n_ctx_train = 2048; // the context size used during training
896
- uint32_t n_ctx = 512; // the context size used during inference
897
- uint32_t n_embd = 4096;
898
- uint32_t n_head = 32;
899
- uint32_t n_head_kv = 32;
900
- uint32_t n_layer = 32;
901
- uint32_t n_rot = 64;
902
- uint32_t n_ff = 11008;
903
-
904
- float f_norm_eps = 1e-5;
905
- float f_norm_rms_eps = 1e-5;
906
-
907
- float rope_freq_base = 10000.0f;
908
- float rope_freq_scale = 1.0f;
928
+ uint32_t n_vocab;
929
+ uint32_t n_ctx_train; // context size the model was trained on
930
+ uint32_t n_ctx; // context size used during inference
931
+ uint32_t n_embd;
932
+ uint32_t n_head;
933
+ uint32_t n_head_kv;
934
+ uint32_t n_layer;
935
+ uint32_t n_rot;
936
+ uint32_t n_ff;
937
+
938
+ float f_norm_eps;
939
+ float f_norm_rms_eps;
940
+
941
+ float rope_freq_base;
942
+ float rope_freq_scale;
909
943
 
910
944
  bool operator!=(const llama_hparams & other) const {
911
945
  return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -947,13 +981,22 @@ struct llama_layer {
947
981
  struct ggml_tensor * wo;
948
982
  struct ggml_tensor * wqkv;
949
983
 
984
+ // attention bias
985
+ struct ggml_tensor * bo;
986
+ struct ggml_tensor * bqkv;
987
+
950
988
  // normalization
951
989
  struct ggml_tensor * ffn_norm;
990
+ struct ggml_tensor * ffn_norm_b;
952
991
 
953
992
  // ff
954
993
  struct ggml_tensor * w1; // ffn_gate
955
994
  struct ggml_tensor * w2; // ffn_down
956
995
  struct ggml_tensor * w3; // ffn_up
996
+
997
+ // ff bias
998
+ struct ggml_tensor * b2; // ffn_down
999
+ struct ggml_tensor * b3; // ffn_up
957
1000
  };
958
1001
 
959
1002
  struct llama_kv_cache {
@@ -1027,10 +1070,11 @@ struct llama_model {
1027
1070
 
1028
1071
  std::string name = "n/a";
1029
1072
 
1030
- llama_hparams hparams;
1073
+ llama_hparams hparams = {};
1031
1074
  llama_vocab vocab;
1032
1075
 
1033
1076
  struct ggml_tensor * tok_embeddings;
1077
+ struct ggml_tensor * pos_embeddings;
1034
1078
 
1035
1079
  struct ggml_tensor * output_norm;
1036
1080
  struct ggml_tensor * output_norm_b;
@@ -1231,6 +1275,7 @@ struct llama_model_loader {
1231
1275
  int n_created = 0;
1232
1276
 
1233
1277
  int64_t n_elements = 0;
1278
+ size_t n_bytes = 0;
1234
1279
 
1235
1280
  bool use_mmap = false;
1236
1281
 
@@ -1263,6 +1308,7 @@ struct llama_model_loader {
1263
1308
  const char * name = gguf_get_tensor_name(ctx_gguf, i);
1264
1309
  struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
1265
1310
  n_elements += ggml_nelements(t);
1311
+ n_bytes += ggml_nbytes(t);
1266
1312
  }
1267
1313
 
1268
1314
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1541,7 +1587,7 @@ struct llama_model_loader {
1541
1587
  // load LLaMA models
1542
1588
  //
1543
1589
 
1544
- std::string llama_model_ftype_name(enum llama_ftype ftype) {
1590
+ static std::string llama_model_ftype_name(enum llama_ftype ftype) {
1545
1591
  if (ftype & LLAMA_FTYPE_GUESSED) {
1546
1592
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
1547
1593
  }
@@ -1574,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
1574
1620
 
1575
1621
  static const char * llama_model_type_name(e_model type) {
1576
1622
  switch (type) {
1623
+ case MODEL_1B: return "1B";
1577
1624
  case MODEL_3B: return "3B";
1578
1625
  case MODEL_7B: return "7B";
1579
1626
  case MODEL_13B: return "13B";
1627
+ case MODEL_15B: return "15B";
1580
1628
  case MODEL_30B: return "30B";
1581
1629
  case MODEL_34B: return "34B";
1582
1630
  case MODEL_40B: return "40B";
@@ -1620,28 +1668,17 @@ static void llm_load_hparams(
1620
1668
  hparams.n_head_kv = hparams.n_head;
1621
1669
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
1622
1670
 
1623
- // TODO: manually setting rope freq base and scale should override this
1624
- // FIXME: partial fix when the param specified is not the default value, but
1625
- // will not work for overriding the model value to the params default
1626
-
1627
- llama_context_params defaults = llama_context_default_params();
1628
-
1629
- // rope_freq_base
1630
- {
1631
- float ropebase = 10000.0f;
1632
- GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1633
- if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
1634
- rope_freq_base = ropebase;
1635
- }
1671
+ // rope_freq_base (optional)
1672
+ if (rope_freq_base == 0.0f) {
1673
+ rope_freq_base = 10000.0f;
1674
+ GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1636
1675
  }
1637
1676
 
1638
1677
  // rope_freq_scale (inverse of the kv) is optional
1639
- {
1678
+ if (rope_freq_scale == 0.0f) {
1640
1679
  float ropescale = 1.0f;
1641
1680
  GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1642
- if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
1643
- rope_freq_scale = 1.0f/ropescale;
1644
- }
1681
+ rope_freq_scale = 1.0f/ropescale;
1645
1682
  }
1646
1683
 
1647
1684
  // sanity check for n_rot (optional)
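
The rewritten block above simplifies how rope_freq_base and rope_freq_scale are resolved: a value of 0.0f in the context parameters now means "defer to the GGUF metadata, falling back to the default", while any explicit non-zero value always wins. A small sketch of that rule, not taken from the gem (resolve_rope_freq_base is an illustrative name):

#include <cstdio>

static float resolve_rope_freq_base(float requested, bool gguf_has_value, float gguf_value) {
    if (requested != 0.0f) {
        return requested;                          // explicit user override wins
    }
    return gguf_has_value ? gguf_value : 10000.0f; // model metadata, else the default
}

int main() {
    std::printf("%.1f\n", resolve_rope_freq_base(0.0f,     true,  1000000.0f)); // 1000000.0 (from GGUF)
    std::printf("%.1f\n", resolve_rope_freq_base(0.0f,     false, 0.0f));       // 10000.0 (default)
    std::printf("%.1f\n", resolve_rope_freq_base(50000.0f, true,  10000.0f));   // 50000.0 (override)
    return 0;
}
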
@@ -1685,6 +1722,26 @@ static void llm_load_hparams(
1685
1722
  default: model.type = e_model::MODEL_UNKNOWN;
1686
1723
  }
1687
1724
  } break;
1725
+ case LLM_ARCH_BAICHUAN:
1726
+ {
1727
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
1728
+ switch (hparams.n_layer) {
1729
+ case 32: model.type = e_model::MODEL_7B; break;
1730
+ case 40: model.type = e_model::MODEL_13B; break;
1731
+ default: model.type = e_model::MODEL_UNKNOWN;
1732
+ }
1733
+ } break;
1734
+ case LLM_ARCH_STARCODER:
1735
+ {
1736
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
1737
+ switch (hparams.n_layer) {
1738
+ case 24: model.type = e_model::MODEL_1B; break;
1739
+ case 36: model.type = e_model::MODEL_3B; break;
1740
+ case 42: model.type = e_model::MODEL_7B; break;
1741
+ case 40: model.type = e_model::MODEL_15B; break;
1742
+ default: model.type = e_model::MODEL_UNKNOWN;
1743
+ }
1744
+ } break;
1688
1745
  default: (void)0;
1689
1746
  };
1690
1747
 
@@ -1838,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1838
1895
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1839
1896
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
1840
1897
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
1841
- LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
1898
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
1899
+ if (ml.n_bytes < GB) {
1900
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1901
+ } else {
1902
+ LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1903
+ }
1842
1904
 
1843
1905
  // general kv
1844
1906
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
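
Together with the n_bytes accumulator added to llama_model_loader earlier in this diff, the block above now reports the on-disk tensor size and the average bits per weight (BPW = bytes * 8 / parameters). A standalone sketch with illustrative numbers, not measured from any real model:

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t GB         = 1024ull*1024ull*1024ull;
    const uint64_t n_elements = 6738415616ull;   // ~6.7B parameters (illustrative)
    const uint64_t n_bytes    = 3825065984ull;   // ~3.56 GiB of tensor data (illustrative)

    const double bpw = n_bytes*8.0/n_elements;   // ~4.54 bits per weight for these values
    if (n_bytes < GB) {
        std::printf("model size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, bpw);
    } else {
        std::printf("model size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, bpw);
    }
    return 0;
}
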
@@ -1925,7 +1987,6 @@ static void llm_load_tensors(
1925
1987
  const int64_t n_vocab = hparams.n_vocab;
1926
1988
 
1927
1989
  const auto tn = LLM_TN(model.arch);
1928
-
1929
1990
  switch (model.arch) {
1930
1991
  case LLM_ARCH_LLAMA:
1931
1992
  {
@@ -1968,6 +2029,72 @@ static void llm_load_tensors(
1968
2029
 
1969
2030
  model.layers.resize(n_layer);
1970
2031
 
2032
+ for (uint32_t i = 0; i < n_layer; ++i) {
2033
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2034
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2035
+
2036
+ auto & layer = model.layers[i];
2037
+
2038
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2039
+
2040
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
2041
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2042
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2043
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2044
+
2045
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2046
+
2047
+ layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
2048
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2049
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2050
+
2051
+ if (backend == GGML_BACKEND_GPU) {
2052
+ vram_weights +=
2053
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2054
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2055
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
2056
+ }
2057
+ }
2058
+ } break;
2059
+ case LLM_ARCH_BAICHUAN:
2060
+ {
2061
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2062
+ {
2063
+ ggml_backend backend_norm;
2064
+ ggml_backend backend_output;
2065
+
2066
+ if (n_gpu_layers > int(n_layer)) {
2067
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2068
+ // on Windows however this is detrimental unless everything is on the GPU
2069
+ #ifndef _WIN32
2070
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2071
+ #else
2072
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2073
+ #endif // _WIN32
2074
+
2075
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2076
+ } else {
2077
+ backend_norm = GGML_BACKEND_CPU;
2078
+ backend_output = GGML_BACKEND_CPU;
2079
+ }
2080
+
2081
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2082
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2083
+
2084
+ if (backend_norm == GGML_BACKEND_GPU) {
2085
+ vram_weights += ggml_nbytes(model.output_norm);
2086
+ }
2087
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2088
+ vram_weights += ggml_nbytes(model.output);
2089
+ }
2090
+ }
2091
+
2092
+ const uint32_t n_ff = hparams.n_ff;
2093
+
2094
+ const int i_gpu_start = n_layer - n_gpu_layers;
2095
+
2096
+ model.layers.resize(n_layer);
2097
+
1971
2098
  for (uint32_t i = 0; i < n_layer; ++i) {
1972
2099
  const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1973
2100
  const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
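
The new Baichuan branch above reuses the same offload split as the LLaMA branch: with i_gpu_start = n_layer - n_gpu_layers, layers at or beyond i_gpu_start get the offload backend and the rest stay on the CPU. A tiny sketch with illustrative values, not from the gem:

#include <cstdio>

int main() {
    const int n_layer      = 32;  // e.g. the 7B configurations above
    const int n_gpu_layers = 20;  // requested offload count (illustrative)

    const int i_gpu_start = n_layer - n_gpu_layers;
    std::printf("layers 0..%d stay on the CPU, layers %d..%d are offloaded\n",
                i_gpu_start - 1, i_gpu_start, n_layer - 1);
    return 0;
}
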
@@ -2073,6 +2200,85 @@ static void llm_load_tensors(
2073
2200
  }
2074
2201
  }
2075
2202
  } break;
2203
+ case LLM_ARCH_STARCODER:
2204
+ {
2205
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2206
+ model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
2207
+
2208
+ // output
2209
+ {
2210
+ ggml_backend backend_norm;
2211
+ ggml_backend backend_output;
2212
+
2213
+ if (n_gpu_layers > int(n_layer)) {
2214
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2215
+ // on Windows however this is detrimental unless everything is on the GPU
2216
+ #ifndef _WIN32
2217
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2218
+ #else
2219
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2220
+ #endif // _WIN32
2221
+
2222
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2223
+ } else {
2224
+ backend_norm = GGML_BACKEND_CPU;
2225
+ backend_output = GGML_BACKEND_CPU;
2226
+ }
2227
+
2228
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2229
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2230
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2231
+
2232
+ if (backend_norm == GGML_BACKEND_GPU) {
2233
+ vram_weights += ggml_nbytes(model.output_norm);
2234
+ vram_weights += ggml_nbytes(model.output_norm_b);
2235
+ }
2236
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2237
+ vram_weights += ggml_nbytes(model.output);
2238
+ }
2239
+ }
2240
+
2241
+ const uint32_t n_ff = hparams.n_ff;
2242
+
2243
+ const int i_gpu_start = n_layer - n_gpu_layers;
2244
+
2245
+ model.layers.resize(n_layer);
2246
+
2247
+ for (uint32_t i = 0; i < n_layer; ++i) {
2248
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2249
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2250
+
2251
+ auto & layer = model.layers[i];
2252
+
2253
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2254
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2255
+
2256
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2257
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2258
+
2259
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2260
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2261
+
2262
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2263
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2264
+
2265
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2266
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2267
+
2268
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2269
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2270
+
2271
+ if (backend == GGML_BACKEND_GPU) {
2272
+ vram_weights +=
2273
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2274
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2275
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2276
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2277
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
2278
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
2279
+ }
2280
+ }
2281
+ } break;
2076
2282
  default:
2077
2283
  throw std::runtime_error("unknown architecture");
2078
2284
  };
@@ -2354,11 +2560,356 @@ static struct ggml_cgraph * llm_build_llama(
2354
2560
  offload_func_kq(tmpq);
2355
2561
  ggml_set_name(tmpq, "tmpq");
2356
2562
 
2357
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2563
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2564
+ offload_func_kq(Kcur);
2565
+ ggml_set_name(Kcur, "Kcur");
2566
+
2567
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2568
+ offload_func_kq(Qcur);
2569
+ ggml_set_name(Qcur, "Qcur");
2570
+
2571
+ // store key and value to memory
2572
+ {
2573
+ // compute the transposed [N, n_embd] V matrix
2574
+
2575
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2576
+ offload_func_v(tmpv);
2577
+ ggml_set_name(tmpv, "tmpv");
2578
+
2579
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2580
+ offload_func_v(Vcur);
2581
+ ggml_set_name(Vcur, "Vcur");
2582
+
2583
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2584
+ offload_func_kq(k);
2585
+ ggml_set_name(k, "k");
2586
+
2587
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2588
+ ( n_ctx)*ggml_element_size(kv_self.v),
2589
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2590
+ offload_func_v(v);
2591
+ ggml_set_name(v, "v");
2592
+
2593
+ // important: storing RoPE-ed version of K in the KV cache!
2594
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
2595
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
2596
+ }
2597
+
2598
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
2599
+ offload_func_kq(Q);
2600
+ ggml_set_name(Q, "Q");
2601
+
2602
+ struct ggml_tensor * K =
2603
+ ggml_view_3d(ctx0, kv_self.k,
2604
+ n_embd_head, n_past + N, n_head_kv,
2605
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2606
+ ggml_element_size(kv_self.k)*n_embd_head,
2607
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
2608
+ offload_func_kq(K);
2609
+ ggml_set_name(K, "K");
2610
+
2611
+ // K * Q
2612
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2613
+ offload_func_kq(KQ);
2614
+ ggml_set_name(KQ, "KQ");
2615
+
2616
+ // KQ_scaled = KQ / sqrt(n_embd_head)
2617
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
2618
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2619
+ offload_func_kq(KQ_scaled);
2620
+ ggml_set_name(KQ_scaled, "KQ_scaled");
2621
+
2622
+ // KQ_masked = mask_past(KQ_scaled)
2623
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2624
+ offload_func_kq(KQ_masked);
2625
+ ggml_set_name(KQ_masked, "KQ_masked");
2626
+
2627
+ // KQ = soft_max(KQ_masked)
2628
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2629
+ offload_func_v(KQ_soft_max);
2630
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
2631
+
2632
+ // split cached V into n_head heads
2633
+ struct ggml_tensor * V =
2634
+ ggml_view_3d(ctx0, kv_self.v,
2635
+ n_past + N, n_embd_head, n_head_kv,
2636
+ ggml_element_size(kv_self.v)*n_ctx,
2637
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2638
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
2639
+ offload_func_v(V);
2640
+ ggml_set_name(V, "V");
2641
+
2642
+ #if 1
2643
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2644
+ offload_func_v(KQV);
2645
+ ggml_set_name(KQV, "KQV");
2646
+ #else
2647
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2648
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2649
+ // is there a better way?
2650
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2651
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2652
+ #endif
2653
+
2654
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
2655
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2656
+ offload_func_v(KQV_merged);
2657
+ ggml_set_name(KQV_merged, "KQV_merged");
2658
+
2659
+ // cur = KQV_merged.contiguous().view(n_embd, N)
2660
+ cur = ggml_cpy(ctx0,
2661
+ KQV_merged,
2662
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2663
+ offload_func_v(cur);
2664
+ ggml_set_name(cur, "KQV_merged_contiguous");
2665
+
2666
+ // projection (no bias)
2667
+ cur = ggml_mul_mat(ctx0,
2668
+ model.layers[il].wo,
2669
+ cur);
2670
+ offload_func(cur);
2671
+ ggml_set_name(cur, "result_wo");
2672
+ }
2673
+
2674
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
2675
+ offload_func(inpFF);
2676
+ ggml_set_name(inpFF, "inpFF");
2677
+
2678
+ // feed-forward network
2679
+ {
2680
+ // norm
2681
+ {
2682
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
2683
+ offload_func(cur);
2684
+ ggml_set_name(cur, "rms_norm_1");
2685
+
2686
+ // cur = cur*ffn_norm(broadcasted)
2687
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
2688
+ offload_func(cur);
2689
+ ggml_set_name(cur, "ffn_norm");
2690
+ }
2691
+
2692
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
2693
+ model.layers[il].w3,
2694
+ cur);
2695
+ offload_func(tmp);
2696
+ ggml_set_name(tmp, "result_w3");
2697
+
2698
+ cur = ggml_mul_mat(ctx0,
2699
+ model.layers[il].w1,
2700
+ cur);
2701
+ offload_func(cur);
2702
+ ggml_set_name(cur, "result_w1");
2703
+
2704
+ // SILU activation
2705
+ cur = ggml_silu(ctx0, cur);
2706
+ offload_func(cur);
2707
+ ggml_set_name(cur, "silu");
2708
+
2709
+ cur = ggml_mul(ctx0, cur, tmp);
2710
+ offload_func(cur);
2711
+ ggml_set_name(cur, "silu_x_result_w3");
2712
+
2713
+ cur = ggml_mul_mat(ctx0,
2714
+ model.layers[il].w2,
2715
+ cur);
2716
+ offload_func(cur);
2717
+ ggml_set_name(cur, "result_w2");
2718
+ }
2719
+
2720
+ cur = ggml_add(ctx0, cur, inpFF);
2721
+ offload_func(cur);
2722
+ ggml_set_name(cur, "inpFF_+_result_w2");
2723
+
2724
+ // input for next layer
2725
+ inpL = cur;
2726
+ }
2727
+
2728
+ cur = inpL;
2729
+
2730
+ // norm
2731
+ {
2732
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
2733
+ offload_func_nr(cur);
2734
+ ggml_set_name(cur, "rms_norm_2");
2735
+
2736
+ // cur = cur*norm(broadcasted)
2737
+ cur = ggml_mul(ctx0, cur, model.output_norm);
2738
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
2739
+ ggml_set_name(cur, "result_norm");
2740
+ }
2741
+
2742
+ // lm_head
2743
+ cur = ggml_mul_mat(ctx0, model.output, cur);
2744
+ ggml_set_name(cur, "result_output");
2745
+
2746
+ ggml_build_forward_expand(gf, cur);
2747
+
2748
+ ggml_free(ctx0);
2749
+
2750
+ return gf;
2751
+ }
2752
+
2753
+
2754
+ static struct ggml_cgraph * llm_build_baichaun(
2755
+ llama_context & lctx,
2756
+ const llama_token * tokens,
2757
+ const float * embd,
2758
+ int n_tokens,
2759
+ int n_past) {
2760
+
2761
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2762
+
2763
+ const int N = n_tokens;
2764
+
2765
+ const auto & model = lctx.model;
2766
+ const auto & hparams = model.hparams;
2767
+
2768
+ const auto & kv_self = lctx.kv_self;
2769
+
2770
+ GGML_ASSERT(!!kv_self.ctx);
2771
+
2772
+ const int64_t n_embd = hparams.n_embd;
2773
+ const int64_t n_layer = hparams.n_layer;
2774
+ const int64_t n_ctx = hparams.n_ctx;
2775
+ const int64_t n_head = hparams.n_head;
2776
+ const int64_t n_head_kv = hparams.n_head_kv;
2777
+ const int64_t n_embd_head = hparams.n_embd_head();
2778
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
2779
+
2780
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
2781
+
2782
+ const float freq_base = hparams.rope_freq_base;
2783
+ const float freq_scale = hparams.rope_freq_scale;
2784
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
2785
+
2786
+ const int n_gpu_layers = model.n_gpu_layers;
2787
+
2788
+ auto & buf_compute = lctx.buf_compute;
2789
+
2790
+ struct ggml_init_params params = {
2791
+ /*.mem_size =*/ buf_compute.size,
2792
+ /*.mem_buffer =*/ buf_compute.data,
2793
+ /*.no_alloc =*/ false,
2794
+ };
2795
+
2796
+ params.no_alloc = true;
2797
+
2798
+ struct ggml_context * ctx0 = ggml_init(params);
2799
+
2800
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
2801
+
2802
+ struct ggml_tensor * cur;
2803
+ struct ggml_tensor * inpL;
2804
+
2805
+ if (tokens) {
2806
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2807
+
2808
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
2809
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2810
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2811
+ }
2812
+ ggml_set_name(inp_tokens, "inp_tokens");
2813
+
2814
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
2815
+ } else {
2816
+ #ifdef GGML_USE_MPI
2817
+ GGML_ASSERT(false && "not implemented");
2818
+ #endif
2819
+
2820
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2821
+
2822
+ ggml_allocr_alloc(lctx.alloc, inpL);
2823
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2824
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2825
+ }
2826
+ }
2827
+
2828
+ const int i_gpu_start = n_layer - n_gpu_layers;
2829
+ (void) i_gpu_start;
2830
+
2831
+ // offload functions set the tensor output backend to GPU
2832
+ // tensors are GPU-accelerated if any input or the output has been offloaded
2833
+ //
2834
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2835
+ // in that case ggml_cuda_assign_buffers has no effect
2836
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2837
+ offload_func_t offload_func_kq = llama_nop;
2838
+ offload_func_t offload_func_v = llama_nop;
2839
+
2840
+ #ifdef GGML_USE_CUBLAS
2841
+ if (n_gpu_layers > n_layer) {
2842
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
2843
+ }
2844
+ if (n_gpu_layers > n_layer + 1) {
2845
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
2846
+ }
2847
+ if (n_gpu_layers > n_layer + 2) {
2848
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
2849
+ }
2850
+ #endif // GGML_USE_CUBLAS
2851
+
2852
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2853
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
2854
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2855
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2856
+ }
2857
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2858
+
2859
+ for (int il = 0; il < n_layer; ++il) {
2860
+ ggml_format_name(inpL, "layer_inp_%d", il);
2861
+
2862
+ offload_func_t offload_func = llama_nop;
2863
+
2864
+ #ifdef GGML_USE_CUBLAS
2865
+ if (il >= i_gpu_start) {
2866
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
2867
+ }
2868
+ #endif // GGML_USE_CUBLAS
2869
+
2870
+ struct ggml_tensor * inpSA = inpL;
2871
+
2872
+ // norm
2873
+ {
2874
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
2875
+ offload_func(cur);
2876
+ ggml_set_name(cur, "rms_norm_0");
2877
+
2878
+ // cur = cur*attn_norm(broadcasted)
2879
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
2880
+ offload_func(cur);
2881
+ ggml_set_name(cur, "attention_norm_0");
2882
+ }
2883
+
2884
+ // self-attention
2885
+ {
2886
+ // compute Q and K and RoPE them
2887
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
2888
+ offload_func_kq(tmpk);
2889
+ ggml_set_name(tmpk, "tmpk");
2890
+
2891
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
2892
+ offload_func_kq(tmpq);
2893
+ ggml_set_name(tmpq, "tmpq");
2894
+
2895
+ struct ggml_tensor * Kcur;
2896
+ struct ggml_tensor * Qcur;
2897
+ switch (model.type) {
2898
+ case MODEL_7B:
2899
+ Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2900
+ Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2901
+ break;
2902
+ case MODEL_13B:
2903
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2904
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
2905
+ break;
2906
+ default:
2907
+ GGML_ASSERT(false);
2908
+ }
2909
+
2358
2910
  offload_func_kq(Kcur);
2359
2911
  ggml_set_name(Kcur, "Kcur");
2360
2912
 
2361
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2362
2913
  offload_func_kq(Qcur);
2363
2914
  ggml_set_name(Qcur, "Qcur");
2364
2915
 
@@ -2413,10 +2964,26 @@ static struct ggml_cgraph * llm_build_llama(
2413
2964
  offload_func_kq(KQ_scaled);
2414
2965
  ggml_set_name(KQ_scaled, "KQ_scaled");
2415
2966
 
2967
+ struct ggml_tensor * KQ_masked;
2968
+ struct ggml_tensor * KQ_scaled_alibi;
2969
+
2970
+ switch (model.type) {
2971
+ case MODEL_7B:
2972
+ KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2973
+ break;
2974
+ case MODEL_13B:
2975
+ KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
2976
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2977
+ KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2978
+ break;
2979
+ default:
2980
+ GGML_ASSERT(false);
2981
+ }
2416
2982
  // KQ_masked = mask_past(KQ_scaled)
2417
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2418
- offload_func_kq(KQ_masked);
2419
- ggml_set_name(KQ_masked, "KQ_masked");
2983
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2984
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2985
+ // offload_func_kq(KQ_masked);
2986
+ // ggml_set_name(KQ_masked, "KQ_masked");
2420
2987
 
2421
2988
  // KQ = soft_max(KQ_masked)
2422
2989
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
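
The switch above is the main difference between the two Baichuan variants: the 7B path keeps RoPE, while the 13B path skips RoPE and instead biases the attention scores with ggml_alibi(..., n_head, 8) before masking. A standalone sketch of the usual ALiBi slope schedule, assuming the common power-of-two-floor handling; it only illustrates the idea and is not traced through ggml:

#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 40;    // Baichuan-13B uses 40 attention heads (illustrative here)
    const float max_bias = 8.0f;  // same constant passed to ggml_alibi above

    int n_floor = 1;              // power-of-two floor of the head count
    while (n_floor*2 <= n_head) n_floor *= 2;

    // geometric schedule: head k gets slope 2^(-(max_bias/n_floor)*(k+1)),
    // and the score bias grows linearly with token distance
    const float m0 = std::pow(2.0f, -max_bias / n_floor);
    for (int k = 0; k < 3; ++k) {
        const float slope = std::pow(m0, float(k + 1));
        std::printf("head %d: slope %.4f, bias magnitude at distance 16: %.3f\n",
                    k, slope, slope*16.0f);
    }
    return 0;
}
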
@@ -2851,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
2851
3418
  return gf;
2852
3419
  }
2853
3420
 
3421
+ static struct ggml_cgraph * llm_build_starcoder(
3422
+ llama_context & lctx,
3423
+ const llama_token * tokens,
3424
+ const float * embd,
3425
+ int n_tokens,
3426
+ int n_past) {
3427
+
3428
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3429
+
3430
+ const int N = n_tokens;
3431
+
3432
+ const auto & model = lctx.model;
3433
+ const auto & hparams = model.hparams;
3434
+
3435
+ const auto & kv_self = lctx.kv_self;
3436
+
3437
+ GGML_ASSERT(!!kv_self.ctx);
3438
+
3439
+ const int64_t n_embd = hparams.n_embd;
3440
+ const int64_t n_layer = hparams.n_layer;
3441
+ const int64_t n_ctx = hparams.n_ctx;
3442
+ const int64_t n_head = hparams.n_head;
3443
+ const int64_t n_head_kv = hparams.n_head_kv;
3444
+ const int64_t n_embd_head = hparams.n_embd_head();
3445
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3446
+
3447
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
3448
+
3449
+ const float norm_eps = hparams.f_norm_eps;
3450
+
3451
+ auto & buf_compute = lctx.buf_compute;
3452
+
3453
+ struct ggml_init_params params = {
3454
+ /*.mem_size =*/ buf_compute.size,
3455
+ /*.mem_buffer =*/ buf_compute.data,
3456
+ /*.no_alloc =*/ false,
3457
+ };
3458
+
3459
+ params.no_alloc = true;
3460
+
3461
+ struct ggml_context * ctx0 = ggml_init(params);
3462
+
3463
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3464
+
3465
+ struct ggml_tensor * cur;
3466
+ struct ggml_tensor * token;
3467
+ struct ggml_tensor * position;
3468
+ struct ggml_tensor * inpL;
3469
+
3470
+ if (tokens) {
3471
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3472
+
3473
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3474
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3475
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3476
+ }
3477
+ ggml_set_name(inp_tokens, "inp_tokens");
3478
+
3479
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3480
+ } else {
3481
+ #ifdef GGML_USE_MPI
3482
+ GGML_ASSERT(false && "not implemented");
3483
+ #endif
3484
+
3485
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3486
+
3487
+ ggml_allocr_alloc(lctx.alloc, token);
3488
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3489
+ memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
3490
+ }
3491
+ }
3492
+
3493
+ {
3494
+ // Compute position embeddings.
3495
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3496
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
3497
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3498
+ for (int i = 0; i < N; ++i) {
3499
+ ((int32_t *) inp_positions->data)[i] = n_past + i;
3500
+ }
3501
+ }
3502
+ ggml_set_name(inp_positions, "inp_positions");
3503
+
3504
+ position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3505
+ }
3506
+
3507
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3508
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3509
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3510
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3511
+ }
3512
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3513
+
3514
+ inpL = ggml_add(ctx0, token, position);
3515
+ ggml_set_name(inpL, "inpL");
3516
+
3517
+ for (int il = 0; il < n_layer; ++il) {
3518
+ {
3519
+ // Norm
3520
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3521
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
3522
+ }
3523
+
3524
+ {
3525
+ // Self Attention
3526
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3527
+
3528
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
3529
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
3530
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3531
+
3532
+ struct ggml_tensor * Qcur = tmpq;
3533
+ struct ggml_tensor * Kcur = tmpk;
3534
+
3535
+ {
3536
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3537
+ ggml_set_name(Vcur, "Vcur");
3538
+
3539
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3540
+ ggml_set_name(k, "k");
3541
+
3542
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3543
+ ( n_ctx)*ggml_element_size(kv_self.v),
3544
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3545
+
3546
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3547
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3548
+ }
3549
+
3550
+ struct ggml_tensor * Q =
3551
+ ggml_permute(ctx0,
3552
+ ggml_cpy(ctx0,
3553
+ Qcur,
3554
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
3555
+ 0, 2, 1, 3);
3556
+ ggml_set_name(Q, "Q");
3557
+
3558
+ struct ggml_tensor * K =
3559
+ ggml_view_3d(ctx0, kv_self.k,
3560
+ n_embd_head, n_past + N, n_head_kv,
3561
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3562
+ ggml_element_size(kv_self.k)*n_embd_head,
3563
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3564
+ ggml_set_name(K, "K");
3565
+
3566
+ // K * Q
3567
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3568
+ ggml_set_name(KQ, "KQ");
3569
+
3570
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3571
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
3572
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3573
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3574
+
3575
+ // KQ_masked = mask_past(KQ_scaled)
3576
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3577
+ ggml_set_name(KQ_masked, "KQ_masked");
3578
+
3579
+ // KQ = soft_max(KQ_masked)
3580
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3581
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3582
+
3583
+ // split cached V into n_head heads
3584
+ struct ggml_tensor * V =
3585
+ ggml_view_3d(ctx0, kv_self.v,
3586
+ n_past + N, n_embd_head, n_head_kv,
3587
+ ggml_element_size(kv_self.v)*n_ctx,
3588
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3589
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3590
+ ggml_set_name(V, "V");
3591
+
3592
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3593
+ ggml_set_name(KQV, "KQV");
3594
+
3595
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3596
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3597
+ ggml_set_name(KQV_merged, "KQV_merged");
3598
+
3599
+ // cur = KQV_merged.contiguous().view(n_embd, N)
3600
+ cur = ggml_cpy(ctx0,
3601
+ KQV_merged,
3602
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3603
+ ggml_set_name(cur, "KQV_merged_contiguous");
3604
+ }
3605
+
3606
+ // Projection
3607
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
3608
+
3609
+ // Add the input
3610
+ cur = ggml_add(ctx0, cur, inpL);
3611
+
3612
+ struct ggml_tensor * inpFF = cur;
3613
+
3614
+ // FF
3615
+ {
3616
+ // Norm
3617
+ {
3618
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
3619
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
3620
+ }
3621
+
3622
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
3623
+
3624
+ // GELU activation
3625
+ cur = ggml_gelu(ctx0, cur);
3626
+
3627
+ // Projection
3628
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
3629
+ }
3630
+
3631
+ inpL = ggml_add(ctx0, cur, inpFF);
3632
+ }
3633
+
3634
+ // Output Norm
3635
+ {
3636
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3637
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
3638
+ }
3639
+ ggml_set_name(cur, "result_norm");
3640
+
3641
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3642
+ ggml_set_name(cur, "result_output");
3643
+
3644
+ ggml_build_forward_expand(gf, cur);
3645
+ ggml_free(ctx0);
3646
+
3647
+ return gf;
3648
+ }
3649
+
2854
3650
  static struct ggml_cgraph * llama_build_graph(
2855
3651
  llama_context & lctx,
2856
3652
  const llama_token * tokens,
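
Unlike the LLaMA and Baichuan builders, the StarCoder builder above computes Q, K and V with a single fused wqkv matmul and then carves them out as views at offsets 0, n_embd and n_embd + n_embd_gqa (multi-query attention, so K and V are only one head wide). A standalone sketch of that packed layout with illustrative sizes, not from this diff:

#include <cstdio>
#include <vector>

int main() {
    const int n_embd     = 6144; // hidden size (illustrative)
    const int n_embd_gqa = 128;  // one KV head of width 128 under MQA (illustrative)

    std::vector<float> qkv(n_embd + 2*n_embd_gqa, 0.0f); // one row of the fused projection

    float * q = qkv.data();                       // [0, n_embd)
    float * k = qkv.data() + n_embd;              // [n_embd, n_embd + n_embd_gqa)
    float * v = qkv.data() + n_embd + n_embd_gqa; // the remaining n_embd_gqa values

    std::printf("q offset = %td, k offset = %td, v offset = %td\n",
                q - qkv.data(), k - qkv.data(), v - qkv.data()); // 0, 6144, 6272
    return 0;
}
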
@@ -2866,10 +3662,18 @@ static struct ggml_cgraph * llama_build_graph(
2866
3662
  {
2867
3663
  result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
2868
3664
  } break;
3665
+ case LLM_ARCH_BAICHUAN:
3666
+ {
3667
+ result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
3668
+ } break;
2869
3669
  case LLM_ARCH_FALCON:
2870
3670
  {
2871
3671
  result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
2872
3672
  } break;
3673
+ case LLM_ARCH_STARCODER:
3674
+ {
3675
+ result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
3676
+ } break;
2873
3677
  default:
2874
3678
  GGML_ASSERT(false);
2875
3679
  };
@@ -2956,6 +3760,15 @@ static bool llama_eval_internal(
2956
3760
  n_threads = std::min(4, n_threads);
2957
3761
  }
2958
3762
 
3763
+ // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
3764
+ const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
3765
+ model.arch == LLM_ARCH_BAICHUAN ||
3766
+ model.arch == LLM_ARCH_FALCON;
3767
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
3768
+ if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
3769
+ n_threads = 1;
3770
+ }
3771
+
2959
3772
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
2960
3773
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
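
The check added above drops to a single CPU thread when cuBLAS is compiled in, the architecture supports full offload, and n_gpu_layers covers every layer plus the three non-repeating tensor groups, since in that case extra CPU threads only add overhead. A small sketch of the decision, not from the gem (pick_n_threads is an illustrative name):

#include <cstdio>

static int pick_n_threads(int n_threads, bool has_cublas, bool arch_supported,
                          int n_gpu_layers, int n_layer) {
    const bool fully_offloaded = n_gpu_layers >= n_layer + 3;
    if (has_cublas && arch_supported && fully_offloaded) {
        return 1;
    }
    return n_threads;
}

int main() {
    std::printf("%d\n", pick_n_threads(8, true,  true, 35, 32)); // 1: everything on the GPU
    std::printf("%d\n", pick_n_threads(8, true,  true, 20, 32)); // 8: partial offload
    std::printf("%d\n", pick_n_threads(8, false, true, 35, 32)); // 8: no cuBLAS
    return 0;
}
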
2961
3774
 
@@ -2971,10 +3784,6 @@ static bool llama_eval_internal(
2971
3784
  if (lctx.ctx_metal) {
2972
3785
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
2973
3786
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
2974
- ggml_metal_get_tensor (lctx.ctx_metal, res);
2975
- if (!lctx.embedding.empty()) {
2976
- ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
2977
- }
2978
3787
  } else {
2979
3788
  ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
2980
3789
  }
@@ -3123,10 +3932,9 @@ struct llm_tokenizer_spm {
3123
3932
  while (offs < text.size()) {
3124
3933
  llm_symbol sym;
3125
3934
  size_t len = utf8_len(text[offs]);
3126
- GGML_ASSERT(offs + len <= text.size());
3127
3935
  sym.text = text.c_str() + offs;
3128
- sym.n = len;
3129
- offs += len;
3936
+ sym.n = std::min(len, text.size() - offs);
3937
+ offs += sym.n;
3130
3938
  sym.prev = index - 1;
3131
3939
  sym.next = offs == text.size() ? -1 : index + 1;
3132
3940
  index++;
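
The tokenizer change above replaces a hard assertion with a clamp, so text that ends in a truncated multi-byte sequence now yields a shortened final symbol instead of aborting. A standalone sketch; utf8_len is restated here as in upstream llama.cpp, since only its return statement appears in this diff:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    std::string text = "a\xE2\x82"; // 'a' followed by a 3-byte sequence cut off after 2 bytes
    size_t offs = 0;
    while (offs < text.size()) {
        const size_t len = utf8_len(text[offs]);
        const size_t n   = std::min(len, text.size() - offs); // the new clamp
        std::printf("symbol at %zu: expected %zu byte(s), kept %zu\n", offs, len, n);
        offs += n;
    }
    return 0;
}
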
@@ -3488,7 +4296,7 @@ struct llama_grammar_candidate {
3488
4296
 
3489
4297
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
3490
4298
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
3491
- std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
4299
+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
3492
4300
  const char * src,
3493
4301
  llama_partial_utf8 partial_start) {
3494
4302
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -4642,7 +5450,16 @@ void llama_beam_search(llama_context * ctx,
4642
5450
  // quantization
4643
5451
  //
4644
5452
 
4645
- static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
5453
+ template <typename T>
5454
+ struct no_init {
5455
+ T value;
5456
+ no_init() { /* do nothing */ }
5457
+ };
5458
+
5459
+ static void llama_convert_tensor_internal(
5460
+ struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
5461
+ const size_t nelements, const int nthread
5462
+ ) {
4646
5463
  if (output.size() < nelements) {
4647
5464
  output.resize(nelements);
4648
5465
  }
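
The no_init wrapper introduced above has a default constructor that does nothing, so std::vector::resize() stops value-initializing large quantization scratch buffers, and passing the workers vector by reference lets it be reused across tensors. A minimal sketch of the buffer half of that idea, with an illustrative size:

#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
struct no_init {
    T value;
    no_init() { /* intentionally empty: leave the element uninitialized */ }
};

int main() {
    std::vector<no_init<uint8_t>> read_data;     // reused across tensors
    const size_t tensor_bytes = 64ull*1024*1024; // illustrative tensor size

    if (read_data.size() < tensor_bytes) {
        read_data.resize(tensor_bytes);          // grows without zero-filling the new tail
    }
    uint8_t * buf = reinterpret_cast<uint8_t *>(read_data.data()); // raw view for file I/O
    std::printf("scratch buffer of %zu bytes at %p\n", read_data.size(), (void *) buf);
    return 0;
}
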
@@ -4677,7 +5494,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
4677
5494
  auto blocks_per_thread = nblocks / nthread;
4678
5495
  auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
4679
5496
 
4680
- std::vector<std::thread> workers;
4681
5497
  for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
4682
5498
  auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
4683
5499
  auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5506,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
4690
5506
  qtype.to_float(inbuf, outbuf, nels);
4691
5507
  }
4692
5508
  };
4693
- workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
5509
+ workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
4694
5510
  in_buff_offs += thr_block_bytes;
4695
5511
  out_buff_offs += thr_elems;
4696
5512
  }
4697
- for (auto & worker : workers) {
4698
- worker.join();
5513
+ for (auto & w : workers) { w.join(); }
5514
+ workers.clear();
5515
+ }
5516
+
5517
+ #ifdef GGML_USE_K_QUANTS
5518
+ static ggml_type get_k_quant_type(
5519
+ ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
5520
+ int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
5521
+ ) {
5522
+ const std::string name = ggml_get_name(tensor);
5523
+ // TODO: avoid hardcoded tensor names - use the TN_* constants
5524
+ const auto tn = LLM_TN(model.arch);
5525
+
5526
+ auto use_more_bits = [](int i_layer, int num_layers) -> bool {
5527
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
5528
+ };
5529
+
5530
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5531
+ int nx = tensor->ne[0];
5532
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
5533
+ new_type = GGML_TYPE_Q8_0;
5534
+ }
5535
+ else if (new_type != GGML_TYPE_Q8_0) {
5536
+ new_type = GGML_TYPE_Q6_K;
5537
+ }
5538
+ } else if (name.find("attn_v.weight") != std::string::npos) {
5539
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5540
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5541
+ new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5542
+ }
5543
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5544
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
5545
+ use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
5546
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
5547
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
5548
+ (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
5549
+ if (model.type == MODEL_70B) {
5550
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
5551
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
5552
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
5553
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
5554
+ }
5555
+ ++*i_attention_wv;
5556
+ } else if (name.find("ffn_down.weight") != std::string::npos) {
5557
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5558
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5559
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
5560
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
5561
+ : GGML_TYPE_Q3_K;
5562
+ }
5563
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
5564
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
5565
+ }
5566
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
5567
+ if (model.arch == LLM_ARCH_FALCON) {
5568
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
5569
+ use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5570
+ } else {
5571
+ if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5572
+ }
5573
+ }
5574
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5575
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
5576
+ new_type = GGML_TYPE_Q5_K;
5577
+ }
5578
+ ++*i_feed_forward_w2;
5579
+ } else if (name.find("attn_output.weight") != std::string::npos) {
5580
+ if (model.arch != LLM_ARCH_FALCON) {
5581
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
5582
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
5583
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5584
+ } else {
5585
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5586
+ }
5587
+ }
5588
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
5589
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5590
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
5591
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
5592
+ }
5593
+ else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
5594
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5595
+ }
5596
+ // This can be used to reduce the size of the Q5_K_S model.
5597
+ // The associated PPL increase is fully in line with the size reduction
5598
+ //else {
5599
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
5600
+ //}
5601
+ bool convert_incompatible_tensor = false;
5602
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
5603
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
5604
+ int nx = tensor->ne[0];
5605
+ int ny = tensor->ne[1];
5606
+ if (nx % QK_K != 0) {
5607
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
5608
+ convert_incompatible_tensor = true;
5609
+ }
5610
+ }
5611
+ if (convert_incompatible_tensor) {
5612
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5613
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
5614
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
5615
+ } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
5616
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
5617
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
5618
+ } else {
5619
+ throw std::runtime_error("Unsupported tensor size encountered\n");
5620
+ }
4699
5621
  }
5622
+
5623
+ return new_type;
4700
5624
  }
5625
+ #endif
4701
5626
 
4702
5627
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
4703
5628
  ggml_type quantized_type;
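
The k-quant selection logic above moves into get_k_quant_type essentially unchanged; the schedule it keeps is the use_more_bits rule, which promotes the first eighth of the layers, the last eighth, and every third layer in between to a higher-precision type. A standalone sketch, not from this diff, of which layers that selects for a 32-layer model:

#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layer = 32; // e.g. a 7B LLaMA-style model
    for (int i = 0; i < n_layer; ++i) {
        if (use_more_bits(i, n_layer)) {
            std::printf("%d ", i);
        }
    }
    std::printf("\n"); // prints: 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
    return 0;
}
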
@@ -4782,18 +5707,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4782
5707
  std::vector<int64_t> hist_all(1 << 4, 0);
4783
5708
 
4784
5709
  std::vector<std::thread> workers;
5710
+ workers.reserve(nthread);
4785
5711
  std::mutex mutex;
4786
5712
 
4787
- #ifdef GGML_USE_K_QUANTS
4788
- auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
4789
- return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
4790
- };
4791
- #endif
4792
-
4793
5713
  int idx = 0;
4794
5714
 
4795
- std::vector<uint8_t> read_data;
4796
- std::vector<uint8_t> work;
5715
+ std::vector<no_init<uint8_t>> read_data;
5716
+ std::vector<no_init<uint8_t>> work;
5717
+ std::vector<no_init<float>> f32_conv_buf;
4797
5718
 
4798
5719
  // populate the original tensors so we get an initial meta data
4799
5720
  for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4815,7 +5736,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4815
5736
 
4816
5737
  const std::string name = ggml_get_name(tensor);
4817
5738
 
4818
- read_data.resize(ggml_nbytes(tensor));
5739
+ if (read_data.size() < ggml_nbytes(tensor)) {
5740
+ read_data.resize(ggml_nbytes(tensor));
5741
+ }
4819
5742
  tensor->data = read_data.data();
4820
5743
  ml->load_data_for(tensor);
4821
5744
 
@@ -4840,101 +5763,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4840
5763
  if (quantize) {
4841
5764
  new_type = quantized_type;
4842
5765
  #ifdef GGML_USE_K_QUANTS
4843
- // TODO: avoid hardcoded tensor names - use the TN_* constants
4844
- const auto tn = LLM_TN(ml->get_arch());
4845
-
4846
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
4847
- int nx = tensor->ne[0];
4848
- if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
4849
- new_type = GGML_TYPE_Q8_0;
4850
- }
4851
- else if (new_type != GGML_TYPE_Q8_0) {
4852
- new_type = GGML_TYPE_Q6_K;
4853
- }
4854
- } else if (name.find("attn_v.weight") != std::string::npos) {
4855
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4856
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
4857
- new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
4858
- }
4859
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
4860
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
4861
- use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
4862
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
4863
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
4864
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
4865
- if (model.type == MODEL_70B) {
4866
- // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
4867
- // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
4868
- // nearly negligible increase in model size by quantizing this tensor with more bits:
4869
- if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
4870
- }
4871
- ++i_attention_wv;
4872
- } else if (name.find("ffn_down.weight") != std::string::npos) {
4873
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4874
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
4875
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
4876
- : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
4877
- : GGML_TYPE_Q3_K;
4878
- }
4879
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
4880
- new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
4881
- }
4882
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
4883
- if (model.arch == LLM_ARCH_FALCON) {
4884
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
4885
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
4886
- } else {
4887
- if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
4888
- }
4889
- }
4890
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
4891
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
4892
- new_type = GGML_TYPE_Q5_K;
4893
- }
4894
- ++i_feed_forward_w2;
4895
- } else if (name.find("attn_output.weight") != std::string::npos) {
4896
- if (model.arch != LLM_ARCH_FALCON) {
4897
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
4898
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
4899
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
4900
- } else {
4901
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
4902
- }
4903
- }
4904
- else if (name.find("attn_qkv.weight") != std::string::npos) {
4905
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
4906
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
4907
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
4908
- }
4909
- else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
4910
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4911
- }
4912
- // This can be used to reduce the size of the Q5_K_S model.
4913
- // The associated PPL increase is fully in line with the size reduction
4914
- //else {
4915
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
4916
- //}
4917
- bool convert_incompatible_tensor = false;
4918
- if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
4919
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
4920
- int nx = tensor->ne[0];
4921
- int ny = tensor->ne[1];
4922
- if (nx % QK_K != 0) {
4923
- LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
4924
- convert_incompatible_tensor = true;
4925
- }
4926
- }
4927
- if (convert_incompatible_tensor) {
4928
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
4929
- new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
4930
- LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
4931
- } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
4932
- new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
4933
- LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
4934
- } else {
4935
- throw std::runtime_error("Unsupported tensor size encountered\n");
4936
- }
4937
- }
5766
+ new_type = get_k_quant_type(
5767
+ new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
5768
+ );
4938
5769
  #endif
4939
5770
  // If we've decided to quantize to the same type the tensor is already
4940
5771
  // in then there's nothing to do.
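The ninety-odd deleted lines above (special cases for output.weight, attn_v.weight, ffn_down.weight, attn_output.weight, attn_qkv.weight, ffn_gate/ffn_up, plus the F16/Q4_0 fallback for tensors whose row size is not a multiple of QK_K) are consolidated into a single get_k_quant_type() call. The helper's body is outside this diff, so its signature below is an inference from the call site; the use_more_bits heuristic it presumably absorbs is the lambda removed in the earlier hunk, reproduced here as a standalone function:

// Inferred declaration (assumption based on the call site above):
//   static ggml_type get_k_quant_type(
//       ggml_type new_type, const ggml_tensor * tensor, const llama_model & model,
//       llama_ftype ftype, int * i_attention_wv, int n_attention_wv,
//       int * i_feed_forward_w2, int n_feed_forward_w2);

#include <cstdio>

// Body copied from the removed lambda: spend extra bits on the first and last
// eighth of the layers, and on every third layer in between.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8) % 3 == 2;
}

int main() {
    const int n_layers = 32;
    for (int i = 0; i < n_layers; ++i) {
        if (use_more_bits(i, n_layers)) {
            printf("layer %2d: attn_v / ffn_down get the higher-bit k-quant\n", i);
        }
    }
    return 0;
}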
@@ -4949,23 +5780,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4949
5780
  const size_t nelements = ggml_nelements(tensor);
4950
5781
 
4951
5782
  float * f32_data;
4952
- std::vector<float> f32_conv_buf;
4953
5783
 
4954
5784
  if (tensor->type == GGML_TYPE_F32) {
4955
5785
  f32_data = (float *) tensor->data;
4956
5786
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
4957
5787
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
4958
5788
  } else {
4959
- llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
5789
+ llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
4960
5790
  f32_data = (float *) f32_conv_buf.data();
4961
5791
  }
4962
5792
 
4963
5793
  LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
4964
5794
  fflush(stdout);
4965
5795
 
4966
- work.resize(nelements * 4); // upper bound on size
5796
+ if (work.size() < nelements * 4) {
5797
+ work.resize(nelements * 4); // upper bound on size
5798
+ }
4967
5799
  new_data = work.data();
4968
- std::vector<int64_t> hist_cur(1 << 4, 0);
5800
+ std::array<int64_t, 1 << 4> hist_cur = {};
4969
5801
 
4970
5802
  static const int chunk_size = 32 * 512;
4971
5803
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
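As with read_data earlier, work and the hoisted f32_conv_buf are now grown only when the current tensor needs more room than any previous one, so one allocation is reused across the whole pass, and the 16-bucket histogram becomes a zero-initialized std::array because its size (1 << 4) is a compile-time constant. A minimal, self-contained sketch of the grow-only idiom (names are illustrative):

#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// Grow-only scratch buffer: capacity tracks the running maximum, so only the
// largest tensor seen so far ever triggers a reallocation.
static void ensure_size(std::vector<uint8_t> & buf, size_t nbytes) {
    if (buf.size() < nbytes) {
        buf.resize(nbytes);
    }
}

int main() {
    std::vector<uint8_t> work; // stands in for the no_init<> vectors above
    const size_t tensor_bytes[] = { 1u << 16, 1u << 20, 1u << 12 };
    for (size_t nbytes : tensor_bytes) {
        ensure_size(work, nbytes); // reallocates only for the 1 MiB tensor
    }
    printf("final buffer size: %zu bytes\n", work.size());

    std::array<int64_t, 1 << 4> hist_cur = {}; // all 16 buckets start at zero
    printf("buckets: %zu\n", hist_cur.size());
    return 0;
}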
@@ -4976,13 +5808,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4976
5808
  size_t counter = 0;
4977
5809
  new_size = 0;
4978
5810
  auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
4979
- std::vector<int64_t> local_hist;
5811
+ std::array<int64_t, 1 << 4> local_hist = {};
4980
5812
  size_t local_size = 0;
4981
5813
  while (true) {
4982
5814
  std::unique_lock<std::mutex> lock(mutex);
4983
5815
  size_t first = counter; counter += chunk_size;
4984
5816
  if (first >= nelements) {
4985
- if (!local_hist.empty()) {
5817
+ if (local_size > 0) {
4986
5818
  for (int j=0; j<int(local_hist.size()); ++j) {
4987
5819
  hist_cur[j] += local_hist[j];
4988
5820
  }
@@ -4992,22 +5824,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4992
5824
  }
4993
5825
  lock.unlock();
4994
5826
  size_t last = std::min(nelements, first + chunk_size);
4995
- if (local_hist.empty()) {
4996
- local_hist.resize(hist_cur.size(), 0);
4997
- }
4998
5827
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
4999
5828
  }
5000
5829
  };
5001
- if ((int) workers.size() < nthread_use - 1) {
5002
- workers.resize(nthread_use - 1);
5003
- }
5004
5830
  for (int it = 0; it < nthread_use - 1; ++it) {
5005
- workers[it] = std::thread(compute);
5831
+ workers.emplace_back(compute);
5006
5832
  }
5007
5833
  compute();
5008
- for (int it = 0; it < nthread_use - 1; ++it) {
5009
- workers[it].join();
5010
- }
5834
+ for (auto & w : workers) { w.join(); }
5835
+ workers.clear();
5011
5836
  }
5012
5837
 
5013
5838
  LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -5069,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5069
5894
  }
5070
5895
 
5071
5896
  // TODO: after the GGUF PR, this likely won't work and needs to be updated
5072
- int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
5897
+ static int llama_apply_lora_from_file_internal(
5898
+ const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
5899
+ ) {
5073
5900
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
5074
5901
 
5075
5902
  const int64_t t_start_lora_us = ggml_time_us();
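Only the linkage and formatting of llama_apply_lora_from_file_internal change here; callers are expected to reach it through the public wrappers in llama.h. Assuming the wrapper name of this release, a call would look roughly like this:

#include "llama.h"
#include <cstdio>

// Hedged sketch: apply a LoRA adapter to a loaded model via the public API.
// path_base_model may be nullptr to apply the adapter directly on top of the
// (possibly quantized) model weights.
static int apply_lora(struct llama_model * model, const char * path_lora, int n_threads) {
    const int err = llama_model_apply_lora_from_file(model, path_lora, /*path_base_model=*/nullptr, n_threads);
    if (err != 0) {
        fprintf(stderr, "failed to apply LoRA adapter '%s'\n", path_lora);
    }
    return err;
}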
@@ -5353,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
5353
6180
  /*.n_gpu_layers =*/ 0,
5354
6181
  /*.main_gpu =*/ 0,
5355
6182
  /*.tensor_split =*/ nullptr,
5356
- /*.rope_freq_base =*/ 10000.0f,
5357
- /*.rope_freq_scale =*/ 1.0f,
6183
+ /*.rope_freq_base =*/ 0.0f,
6184
+ /*.rope_freq_scale =*/ 0.0f,
5358
6185
  /*.progress_callback =*/ nullptr,
5359
6186
  /*.progress_callback_user_data =*/ nullptr,
5360
6187
  /*.low_vram =*/ false,
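The RoPE defaults drop from the hard-coded LLaMA values (10000.0f base, 1.0f scale) to 0.0f, which the loader presumably treats as "not set, use the values recorded in the model's GGUF metadata". A hedged sketch of how callers would use the new defaults (the zero-means-model-default reading is inferred from this change, not stated in the hunk):

#include "llama.h"

static struct llama_context_params make_params(bool override_rope) {
    struct llama_context_params params = llama_context_default_params();
    // leaving rope_freq_base / rope_freq_scale at 0.0f keeps the model's own values
    if (override_rope) {
        params.rope_freq_base  = 10000.0f; // classic LLaMA base frequency
        params.rope_freq_scale = 0.5f;     // linear scaling for roughly 2x context
    }
    return params;
}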
@@ -5616,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
5616
6443
  return ctx;
5617
6444
  }
5618
6445
 
5619
- struct llama_context * llama_init_from_file(
6446
+ static struct llama_context * llama_init_from_file(
5620
6447
  const char * path_model,
5621
6448
  struct llama_context_params params) {
5622
6449
  struct llama_model * model = llama_load_model_from_file(path_model, params);
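llama_init_from_file becomes an internal static helper, so application code is expected to use the two public calls it wraps, both visible in this hunk. A minimal sketch of that replacement pattern:

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    struct llama_context_params params = llama_context_default_params();

    // step 1: load the weights once; the model can back several contexts
    struct llama_model * model = llama_load_model_from_file(argv[1], params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // step 2: create an inference context on top of the model
    struct llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}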
@@ -5635,15 +6462,19 @@ void llama_free(struct llama_context * ctx) {
5635
6462
  }
5636
6463
 
5637
6464
  int llama_n_vocab(const struct llama_context * ctx) {
5638
- return ctx->model.vocab.id_to_token.size();
6465
+ return llama_model_n_vocab(&ctx->model);
5639
6466
  }
5640
6467
 
5641
6468
  int llama_n_ctx(const struct llama_context * ctx) {
5642
- return ctx->model.hparams.n_ctx;
6469
+ return llama_model_n_ctx(&ctx->model);
6470
+ }
6471
+
6472
+ int llama_n_ctx_train(const struct llama_context * ctx) {
6473
+ return llama_model_n_ctx_train(&ctx->model);
5643
6474
  }
5644
6475
 
5645
6476
  int llama_n_embd(const struct llama_context * ctx) {
5646
- return ctx->model.hparams.n_embd;
6477
+ return llama_model_n_embd(&ctx->model);
5647
6478
  }
5648
6479
 
5649
6480
  enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6489,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
5658
6489
  return model->hparams.n_ctx;
5659
6490
  }
5660
6491
 
6492
+ int llama_model_n_ctx_train(const struct llama_model * model) {
6493
+ return model->hparams.n_ctx_train;
6494
+ }
6495
+
5661
6496
  int llama_model_n_embd(const struct llama_model * model) {
5662
6497
  return model->hparams.n_embd;
5663
6498
  }
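The per-context getters now simply forward to the model-level accessors, and both layers gain n_ctx_train, the context length the model was trained with (hparams.n_ctx_train). A typical use is checking the requested context size against it; the warning wording below is illustrative, not from the source:

#include "llama.h"
#include <cstdio>

static void check_context_size(const struct llama_context * ctx) {
    const int n_ctx       = llama_n_ctx(ctx);
    const int n_ctx_train = llama_n_ctx_train(ctx);
    if (n_ctx > n_ctx_train) {
        fprintf(stderr,
                "warning: n_ctx (%d) exceeds the model's training context (%d); "
                "quality may degrade unless RoPE scaling is configured\n",
                n_ctx, n_ctx_train);
    }
}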
@@ -5813,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
5813
6648
  * llama_copy_state_data(ctx, &data_ctx);
5814
6649
  *
5815
6650
  */
5816
- void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
6651
+ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
5817
6652
  // copy rng
5818
6653
  {
5819
6654
  std::stringstream rng_ss;
@@ -6197,22 +7032,24 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
6197
7032
  int llama_tokenize(
6198
7033
  struct llama_context * ctx,
6199
7034
  const char * text,
7035
+ int text_len,
6200
7036
  llama_token * tokens,
6201
7037
  int n_max_tokens,
6202
7038
  bool add_bos) {
6203
- return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
7039
+ return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
6204
7040
  }
6205
7041
 
6206
7042
  int llama_tokenize_with_model(
6207
7043
  const struct llama_model * model,
6208
7044
  const char * text,
7045
+ int text_len,
6209
7046
  llama_token * tokens,
6210
7047
  int n_max_tokens,
6211
7048
  bool add_bos) {
6212
- auto res = llama_tokenize_internal(model->vocab, text, add_bos);
7049
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
6213
7050
 
6214
7051
  if (n_max_tokens < (int) res.size()) {
6215
- LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
7052
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
6216
7053
  return -((int) res.size());
6217
7054
  }
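llama_tokenize and llama_tokenize_with_model now take an explicit text_len (so the input no longer needs to be NUL-terminated), and the "too many tokens" log is silenced because the negative return value already doubles as a size query. That enables the usual two-pass calling pattern, sketched here with an assumed helper name:

#include "llama.h"
#include <string>
#include <vector>

// Hedged sketch: a negative return is -(number of tokens required), so a
// failed first call tells us exactly how large the buffer must be.
static std::vector<llama_token> tokenize(struct llama_context * ctx,
                                         const std::string & text,
                                         bool add_bos) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    int n = llama_tokenize(ctx, text.data(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        tokens.resize((size_t) -n); // grow to the reported size and retry
        n = llama_tokenize(ctx, text.data(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize((size_t) n);
    return tokens;
}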
6218
7055
 
@@ -6351,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
6351
7188
  }
6352
7189
 
6353
7190
  // For internal test use
6354
- const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
7191
+ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
7192
+ struct llama_context * ctx
7193
+ ) {
6355
7194
  return ctx->model.tensors_by_name;
6356
7195
  }
6357
7196