llama_cpp 0.5.1 → 0.5.3

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
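One behavioral change worth noting before the raw diff below: in `llm_load_hparams`, a caller-supplied `rope_freq_base` or `rope_freq_scale` of 0 is now treated as "unset" (fall back to the GGUF metadata if present, then to the built-in default), while any explicit non-zero value overrides whatever the model file says. The sketch that follows is illustrative only — the helper name and the `std::optional` wrapper are not part of the package — but it mirrors the precedence logic introduced in the diff.

#include <cstdio>
#include <optional>

// Illustrative sketch of the new precedence in llm_load_hparams:
// 0.0f from the caller means "not set"; in that case the GGUF metadata
// (if the key exists) wins, otherwise the hard-coded default.
// Any explicit non-zero caller value overrides the model file entirely.
static float resolve_rope_freq_base(float user_value, std::optional<float> gguf_value) {
    if (user_value != 0.0f) {
        return user_value;                    // explicit user override
    }
    return gguf_value.value_or(10000.0f);     // model metadata, else default
}

int main() {
    std::printf("%.1f\n", resolve_rope_freq_base(0.0f,      std::nullopt)); // 10000.0 (default)
    std::printf("%.1f\n", resolve_rope_freq_base(0.0f,      1000000.0f));   // 1000000.0 (from GGUF)
    std::printf("%.1f\n", resolve_rope_freq_base(500000.0f, 1000000.0f));   // 500000.0 (user wins)
    return 0;
}

The same precedence applies to `rope_freq_scale`, with the GGUF key storing the linear scale that the loader inverts (`rope_freq_scale = 1.0f/ropescale`).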
@@ -1,8 +1,4 @@
1
- // Defines fileno on msys:
2
- #ifndef _GNU_SOURCE
3
- #define _GNU_SOURCE
4
- #endif
5
-
1
+ #define LLAMA_API_INTERNAL
6
2
  #include "llama.h"
7
3
 
8
4
  #include "ggml.h"
@@ -113,7 +109,7 @@ static size_t utf8_len(char src) {
113
109
  return lookup[highbits];
114
110
  }
115
111
 
116
- void replace_all(std::string & s, const std::string & search, const std::string & replace) {
112
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
117
113
  std::string result;
118
114
  for (size_t pos = 0; ; pos += search.length()) {
119
115
  auto new_pos = s.find(search, pos);
@@ -160,20 +156,24 @@ static std::string format(const char * fmt, ...) {
160
156
  enum llm_arch {
161
157
  LLM_ARCH_LLAMA,
162
158
  LLM_ARCH_FALCON,
159
+ LLM_ARCH_BAICHUAN,
163
160
  LLM_ARCH_GPT2,
164
161
  LLM_ARCH_GPTJ,
165
162
  LLM_ARCH_GPTNEOX,
166
163
  LLM_ARCH_MPT,
164
+ LLM_ARCH_STARCODER,
167
165
  LLM_ARCH_UNKNOWN,
168
166
  };
169
167
 
170
168
  static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
171
- { LLM_ARCH_LLAMA, "llama" },
172
- { LLM_ARCH_FALCON, "falcon" },
173
- { LLM_ARCH_GPT2, "gpt2" },
174
- { LLM_ARCH_GPTJ, "gptj" },
175
- { LLM_ARCH_GPTNEOX, "gptneox" },
176
- { LLM_ARCH_MPT, "mpt" },
169
+ { LLM_ARCH_LLAMA, "llama" },
170
+ { LLM_ARCH_FALCON, "falcon" },
171
+ { LLM_ARCH_GPT2, "gpt2" },
172
+ { LLM_ARCH_GPTJ, "gptj" },
173
+ { LLM_ARCH_GPTNEOX, "gptneox" },
174
+ { LLM_ARCH_MPT, "mpt" },
175
+ { LLM_ARCH_BAICHUAN, "baichuan" },
176
+ { LLM_ARCH_STARCODER, "starcoder" },
177
177
  };
178
178
 
179
179
  enum llm_kv {
@@ -314,6 +314,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
314
314
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
315
315
  },
316
316
  },
317
+ {
318
+ LLM_ARCH_BAICHUAN,
319
+ {
320
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
321
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
322
+ { LLM_TENSOR_OUTPUT, "output" },
323
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
324
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
325
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
326
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
327
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
328
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
329
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
330
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
331
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
332
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
333
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
334
+ },
335
+ },
317
336
  {
318
337
  LLM_ARCH_FALCON,
319
338
  {
@@ -360,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
360
379
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
361
380
  },
362
381
  },
382
+ {
383
+ LLM_ARCH_STARCODER,
384
+ {
385
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
386
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
387
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
388
+ { LLM_TENSOR_OUTPUT, "output" },
389
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
390
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
391
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
392
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
393
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
394
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
395
+ },
396
+ },
363
397
  {
364
398
  LLM_ARCH_UNKNOWN,
365
399
  {
@@ -658,9 +692,7 @@ struct llama_mmap {
658
692
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
659
693
  if (prefetch) {
660
694
  // Advise the kernel to preload the mapped memory
661
-
662
695
  WIN32_MEMORY_RANGE_ENTRY range;
663
-
664
696
  range.VirtualAddress = addr;
665
697
  range.NumberOfBytes = (SIZE_T)size;
666
698
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -876,9 +908,11 @@ static llama_state g_state;
876
908
  // available llama models
877
909
  enum e_model {
878
910
  MODEL_UNKNOWN,
911
+ MODEL_1B,
879
912
  MODEL_3B,
880
913
  MODEL_7B,
881
914
  MODEL_13B,
915
+ MODEL_15B,
882
916
  MODEL_30B,
883
917
  MODEL_34B,
884
918
  MODEL_40B,
@@ -888,24 +922,24 @@ enum e_model {
888
922
 
889
923
  static const size_t kB = 1024;
890
924
  static const size_t MB = kB*kB;
925
+ static const size_t GB = kB*kB*kB;
891
926
 
892
- // default hparams (LLaMA 7B)
893
927
  struct llama_hparams {
894
- uint32_t n_vocab = 32000;
895
- uint32_t n_ctx_train = 2048; // the context size used during training
896
- uint32_t n_ctx = 512; // the context size used during inference
897
- uint32_t n_embd = 4096;
898
- uint32_t n_head = 32;
899
- uint32_t n_head_kv = 32;
900
- uint32_t n_layer = 32;
901
- uint32_t n_rot = 64;
902
- uint32_t n_ff = 11008;
903
-
904
- float f_norm_eps = 1e-5;
905
- float f_norm_rms_eps = 1e-5;
906
-
907
- float rope_freq_base = 10000.0f;
908
- float rope_freq_scale = 1.0f;
928
+ uint32_t n_vocab;
929
+ uint32_t n_ctx_train; // context size the model was trained on
930
+ uint32_t n_ctx; // context size used during inference
931
+ uint32_t n_embd;
932
+ uint32_t n_head;
933
+ uint32_t n_head_kv;
934
+ uint32_t n_layer;
935
+ uint32_t n_rot;
936
+ uint32_t n_ff;
937
+
938
+ float f_norm_eps;
939
+ float f_norm_rms_eps;
940
+
941
+ float rope_freq_base;
942
+ float rope_freq_scale;
909
943
 
910
944
  bool operator!=(const llama_hparams & other) const {
911
945
  return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -947,13 +981,22 @@ struct llama_layer {
947
981
  struct ggml_tensor * wo;
948
982
  struct ggml_tensor * wqkv;
949
983
 
984
+ // attention bias
985
+ struct ggml_tensor * bo;
986
+ struct ggml_tensor * bqkv;
987
+
950
988
  // normalization
951
989
  struct ggml_tensor * ffn_norm;
990
+ struct ggml_tensor * ffn_norm_b;
952
991
 
953
992
  // ff
954
993
  struct ggml_tensor * w1; // ffn_gate
955
994
  struct ggml_tensor * w2; // ffn_down
956
995
  struct ggml_tensor * w3; // ffn_up
996
+
997
+ // ff bias
998
+ struct ggml_tensor * b2; // ffn_down
999
+ struct ggml_tensor * b3; // ffn_up
957
1000
  };
958
1001
 
959
1002
  struct llama_kv_cache {
@@ -1027,10 +1070,11 @@ struct llama_model {
1027
1070
 
1028
1071
  std::string name = "n/a";
1029
1072
 
1030
- llama_hparams hparams;
1073
+ llama_hparams hparams = {};
1031
1074
  llama_vocab vocab;
1032
1075
 
1033
1076
  struct ggml_tensor * tok_embeddings;
1077
+ struct ggml_tensor * pos_embeddings;
1034
1078
 
1035
1079
  struct ggml_tensor * output_norm;
1036
1080
  struct ggml_tensor * output_norm_b;
@@ -1231,6 +1275,7 @@ struct llama_model_loader {
1231
1275
  int n_created = 0;
1232
1276
 
1233
1277
  int64_t n_elements = 0;
1278
+ size_t n_bytes = 0;
1234
1279
 
1235
1280
  bool use_mmap = false;
1236
1281
 
@@ -1263,6 +1308,7 @@ struct llama_model_loader {
1263
1308
  const char * name = gguf_get_tensor_name(ctx_gguf, i);
1264
1309
  struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
1265
1310
  n_elements += ggml_nelements(t);
1311
+ n_bytes += ggml_nbytes(t);
1266
1312
  }
1267
1313
 
1268
1314
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1541,7 +1587,7 @@ struct llama_model_loader {
1541
1587
  // load LLaMA models
1542
1588
  //
1543
1589
 
1544
- std::string llama_model_ftype_name(enum llama_ftype ftype) {
1590
+ static std::string llama_model_ftype_name(enum llama_ftype ftype) {
1545
1591
  if (ftype & LLAMA_FTYPE_GUESSED) {
1546
1592
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
1547
1593
  }
@@ -1574,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
1574
1620
 
1575
1621
  static const char * llama_model_type_name(e_model type) {
1576
1622
  switch (type) {
1623
+ case MODEL_1B: return "1B";
1577
1624
  case MODEL_3B: return "3B";
1578
1625
  case MODEL_7B: return "7B";
1579
1626
  case MODEL_13B: return "13B";
1627
+ case MODEL_15B: return "15B";
1580
1628
  case MODEL_30B: return "30B";
1581
1629
  case MODEL_34B: return "34B";
1582
1630
  case MODEL_40B: return "40B";
@@ -1620,28 +1668,17 @@ static void llm_load_hparams(
1620
1668
  hparams.n_head_kv = hparams.n_head;
1621
1669
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
1622
1670
 
1623
- // TODO: manually setting rope freq base and scale should override this
1624
- // FIXME: partial fix when the param specified is not the default value, but
1625
- // will not work for overriding the model value to the params default
1626
-
1627
- llama_context_params defaults = llama_context_default_params();
1628
-
1629
- // rope_freq_base
1630
- {
1631
- float ropebase = 10000.0f;
1632
- GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1633
- if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
1634
- rope_freq_base = ropebase;
1635
- }
1671
+ // rope_freq_base (optional)
1672
+ if (rope_freq_base == 0.0f) {
1673
+ rope_freq_base = 10000.0f;
1674
+ GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1636
1675
  }
1637
1676
 
1638
1677
  // rope_freq_scale (inverse of the kv) is optional
1639
- {
1678
+ if (rope_freq_scale == 0.0f) {
1640
1679
  float ropescale = 1.0f;
1641
1680
  GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1642
- if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
1643
- rope_freq_scale = 1.0f/ropescale;
1644
- }
1681
+ rope_freq_scale = 1.0f/ropescale;
1645
1682
  }
1646
1683
 
1647
1684
  // sanity check for n_rot (optional)
@@ -1685,6 +1722,26 @@ static void llm_load_hparams(
1685
1722
  default: model.type = e_model::MODEL_UNKNOWN;
1686
1723
  }
1687
1724
  } break;
1725
+ case LLM_ARCH_BAICHUAN:
1726
+ {
1727
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
1728
+ switch (hparams.n_layer) {
1729
+ case 32: model.type = e_model::MODEL_7B; break;
1730
+ case 40: model.type = e_model::MODEL_13B; break;
1731
+ default: model.type = e_model::MODEL_UNKNOWN;
1732
+ }
1733
+ } break;
1734
+ case LLM_ARCH_STARCODER:
1735
+ {
1736
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
1737
+ switch (hparams.n_layer) {
1738
+ case 24: model.type = e_model::MODEL_1B; break;
1739
+ case 36: model.type = e_model::MODEL_3B; break;
1740
+ case 42: model.type = e_model::MODEL_7B; break;
1741
+ case 40: model.type = e_model::MODEL_15B; break;
1742
+ default: model.type = e_model::MODEL_UNKNOWN;
1743
+ }
1744
+ } break;
1688
1745
  default: (void)0;
1689
1746
  };
1690
1747
 
@@ -1838,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1838
1895
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1839
1896
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
1840
1897
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
1841
- LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
1898
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
1899
+ if (ml.n_bytes < GB) {
1900
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1901
+ } else {
1902
+ LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1903
+ }
1842
1904
 
1843
1905
  // general kv
1844
1906
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
@@ -1925,7 +1987,6 @@ static void llm_load_tensors(
1925
1987
  const int64_t n_vocab = hparams.n_vocab;
1926
1988
 
1927
1989
  const auto tn = LLM_TN(model.arch);
1928
-
1929
1990
  switch (model.arch) {
1930
1991
  case LLM_ARCH_LLAMA:
1931
1992
  {
@@ -1968,6 +2029,72 @@ static void llm_load_tensors(
1968
2029
 
1969
2030
  model.layers.resize(n_layer);
1970
2031
 
2032
+ for (uint32_t i = 0; i < n_layer; ++i) {
2033
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2034
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2035
+
2036
+ auto & layer = model.layers[i];
2037
+
2038
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2039
+
2040
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
2041
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2042
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2043
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2044
+
2045
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2046
+
2047
+ layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
2048
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2049
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2050
+
2051
+ if (backend == GGML_BACKEND_GPU) {
2052
+ vram_weights +=
2053
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2054
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2055
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
2056
+ }
2057
+ }
2058
+ } break;
2059
+ case LLM_ARCH_BAICHUAN:
2060
+ {
2061
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2062
+ {
2063
+ ggml_backend backend_norm;
2064
+ ggml_backend backend_output;
2065
+
2066
+ if (n_gpu_layers > int(n_layer)) {
2067
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2068
+ // on Windows however this is detrimental unless everything is on the GPU
2069
+ #ifndef _WIN32
2070
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2071
+ #else
2072
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2073
+ #endif // _WIN32
2074
+
2075
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2076
+ } else {
2077
+ backend_norm = GGML_BACKEND_CPU;
2078
+ backend_output = GGML_BACKEND_CPU;
2079
+ }
2080
+
2081
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2082
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2083
+
2084
+ if (backend_norm == GGML_BACKEND_GPU) {
2085
+ vram_weights += ggml_nbytes(model.output_norm);
2086
+ }
2087
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2088
+ vram_weights += ggml_nbytes(model.output);
2089
+ }
2090
+ }
2091
+
2092
+ const uint32_t n_ff = hparams.n_ff;
2093
+
2094
+ const int i_gpu_start = n_layer - n_gpu_layers;
2095
+
2096
+ model.layers.resize(n_layer);
2097
+
1971
2098
  for (uint32_t i = 0; i < n_layer; ++i) {
1972
2099
  const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1973
2100
  const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2073,6 +2200,85 @@ static void llm_load_tensors(
2073
2200
  }
2074
2201
  }
2075
2202
  } break;
2203
+ case LLM_ARCH_STARCODER:
2204
+ {
2205
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2206
+ model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
2207
+
2208
+ // output
2209
+ {
2210
+ ggml_backend backend_norm;
2211
+ ggml_backend backend_output;
2212
+
2213
+ if (n_gpu_layers > int(n_layer)) {
2214
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2215
+ // on Windows however this is detrimental unless everything is on the GPU
2216
+ #ifndef _WIN32
2217
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2218
+ #else
2219
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2220
+ #endif // _WIN32
2221
+
2222
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2223
+ } else {
2224
+ backend_norm = GGML_BACKEND_CPU;
2225
+ backend_output = GGML_BACKEND_CPU;
2226
+ }
2227
+
2228
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2229
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2230
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2231
+
2232
+ if (backend_norm == GGML_BACKEND_GPU) {
2233
+ vram_weights += ggml_nbytes(model.output_norm);
2234
+ vram_weights += ggml_nbytes(model.output_norm_b);
2235
+ }
2236
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2237
+ vram_weights += ggml_nbytes(model.output);
2238
+ }
2239
+ }
2240
+
2241
+ const uint32_t n_ff = hparams.n_ff;
2242
+
2243
+ const int i_gpu_start = n_layer - n_gpu_layers;
2244
+
2245
+ model.layers.resize(n_layer);
2246
+
2247
+ for (uint32_t i = 0; i < n_layer; ++i) {
2248
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2249
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2250
+
2251
+ auto & layer = model.layers[i];
2252
+
2253
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2254
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2255
+
2256
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2257
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2258
+
2259
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2260
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2261
+
2262
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2263
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2264
+
2265
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2266
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2267
+
2268
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2269
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2270
+
2271
+ if (backend == GGML_BACKEND_GPU) {
2272
+ vram_weights +=
2273
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2274
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2275
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2276
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2277
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
2278
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
2279
+ }
2280
+ }
2281
+ } break;
2076
2282
  default:
2077
2283
  throw std::runtime_error("unknown architecture");
2078
2284
  };
@@ -2354,11 +2560,356 @@ static struct ggml_cgraph * llm_build_llama(
2354
2560
  offload_func_kq(tmpq);
2355
2561
  ggml_set_name(tmpq, "tmpq");
2356
2562
 
2357
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2563
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2564
+ offload_func_kq(Kcur);
2565
+ ggml_set_name(Kcur, "Kcur");
2566
+
2567
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2568
+ offload_func_kq(Qcur);
2569
+ ggml_set_name(Qcur, "Qcur");
2570
+
2571
+ // store key and value to memory
2572
+ {
2573
+ // compute the transposed [N, n_embd] V matrix
2574
+
2575
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2576
+ offload_func_v(tmpv);
2577
+ ggml_set_name(tmpv, "tmpv");
2578
+
2579
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2580
+ offload_func_v(Vcur);
2581
+ ggml_set_name(Vcur, "Vcur");
2582
+
2583
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2584
+ offload_func_kq(k);
2585
+ ggml_set_name(k, "k");
2586
+
2587
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2588
+ ( n_ctx)*ggml_element_size(kv_self.v),
2589
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2590
+ offload_func_v(v);
2591
+ ggml_set_name(v, "v");
2592
+
2593
+ // important: storing RoPE-ed version of K in the KV cache!
2594
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
2595
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
2596
+ }
2597
+
2598
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
2599
+ offload_func_kq(Q);
2600
+ ggml_set_name(Q, "Q");
2601
+
2602
+ struct ggml_tensor * K =
2603
+ ggml_view_3d(ctx0, kv_self.k,
2604
+ n_embd_head, n_past + N, n_head_kv,
2605
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2606
+ ggml_element_size(kv_self.k)*n_embd_head,
2607
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
2608
+ offload_func_kq(K);
2609
+ ggml_set_name(K, "K");
2610
+
2611
+ // K * Q
2612
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2613
+ offload_func_kq(KQ);
2614
+ ggml_set_name(KQ, "KQ");
2615
+
2616
+ // KQ_scaled = KQ / sqrt(n_embd_head)
2617
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
2618
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2619
+ offload_func_kq(KQ_scaled);
2620
+ ggml_set_name(KQ_scaled, "KQ_scaled");
2621
+
2622
+ // KQ_masked = mask_past(KQ_scaled)
2623
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2624
+ offload_func_kq(KQ_masked);
2625
+ ggml_set_name(KQ_masked, "KQ_masked");
2626
+
2627
+ // KQ = soft_max(KQ_masked)
2628
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2629
+ offload_func_v(KQ_soft_max);
2630
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
2631
+
2632
+ // split cached V into n_head heads
2633
+ struct ggml_tensor * V =
2634
+ ggml_view_3d(ctx0, kv_self.v,
2635
+ n_past + N, n_embd_head, n_head_kv,
2636
+ ggml_element_size(kv_self.v)*n_ctx,
2637
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2638
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
2639
+ offload_func_v(V);
2640
+ ggml_set_name(V, "V");
2641
+
2642
+ #if 1
2643
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2644
+ offload_func_v(KQV);
2645
+ ggml_set_name(KQV, "KQV");
2646
+ #else
2647
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2648
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2649
+ // is there a better way?
2650
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2651
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2652
+ #endif
2653
+
2654
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
2655
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2656
+ offload_func_v(KQV_merged);
2657
+ ggml_set_name(KQV_merged, "KQV_merged");
2658
+
2659
+ // cur = KQV_merged.contiguous().view(n_embd, N)
2660
+ cur = ggml_cpy(ctx0,
2661
+ KQV_merged,
2662
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2663
+ offload_func_v(cur);
2664
+ ggml_set_name(cur, "KQV_merged_contiguous");
2665
+
2666
+ // projection (no bias)
2667
+ cur = ggml_mul_mat(ctx0,
2668
+ model.layers[il].wo,
2669
+ cur);
2670
+ offload_func(cur);
2671
+ ggml_set_name(cur, "result_wo");
2672
+ }
2673
+
2674
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
2675
+ offload_func(inpFF);
2676
+ ggml_set_name(inpFF, "inpFF");
2677
+
2678
+ // feed-forward network
2679
+ {
2680
+ // norm
2681
+ {
2682
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
2683
+ offload_func(cur);
2684
+ ggml_set_name(cur, "rms_norm_1");
2685
+
2686
+ // cur = cur*ffn_norm(broadcasted)
2687
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
2688
+ offload_func(cur);
2689
+ ggml_set_name(cur, "ffn_norm");
2690
+ }
2691
+
2692
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
2693
+ model.layers[il].w3,
2694
+ cur);
2695
+ offload_func(tmp);
2696
+ ggml_set_name(tmp, "result_w3");
2697
+
2698
+ cur = ggml_mul_mat(ctx0,
2699
+ model.layers[il].w1,
2700
+ cur);
2701
+ offload_func(cur);
2702
+ ggml_set_name(cur, "result_w1");
2703
+
2704
+ // SILU activation
2705
+ cur = ggml_silu(ctx0, cur);
2706
+ offload_func(cur);
2707
+ ggml_set_name(cur, "silu");
2708
+
2709
+ cur = ggml_mul(ctx0, cur, tmp);
2710
+ offload_func(cur);
2711
+ ggml_set_name(cur, "silu_x_result_w3");
2712
+
2713
+ cur = ggml_mul_mat(ctx0,
2714
+ model.layers[il].w2,
2715
+ cur);
2716
+ offload_func(cur);
2717
+ ggml_set_name(cur, "result_w2");
2718
+ }
2719
+
2720
+ cur = ggml_add(ctx0, cur, inpFF);
2721
+ offload_func(cur);
2722
+ ggml_set_name(cur, "inpFF_+_result_w2");
2723
+
2724
+ // input for next layer
2725
+ inpL = cur;
2726
+ }
2727
+
2728
+ cur = inpL;
2729
+
2730
+ // norm
2731
+ {
2732
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
2733
+ offload_func_nr(cur);
2734
+ ggml_set_name(cur, "rms_norm_2");
2735
+
2736
+ // cur = cur*norm(broadcasted)
2737
+ cur = ggml_mul(ctx0, cur, model.output_norm);
2738
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
2739
+ ggml_set_name(cur, "result_norm");
2740
+ }
2741
+
2742
+ // lm_head
2743
+ cur = ggml_mul_mat(ctx0, model.output, cur);
2744
+ ggml_set_name(cur, "result_output");
2745
+
2746
+ ggml_build_forward_expand(gf, cur);
2747
+
2748
+ ggml_free(ctx0);
2749
+
2750
+ return gf;
2751
+ }
2752
+
2753
+
2754
+ static struct ggml_cgraph * llm_build_baichaun(
2755
+ llama_context & lctx,
2756
+ const llama_token * tokens,
2757
+ const float * embd,
2758
+ int n_tokens,
2759
+ int n_past) {
2760
+
2761
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2762
+
2763
+ const int N = n_tokens;
2764
+
2765
+ const auto & model = lctx.model;
2766
+ const auto & hparams = model.hparams;
2767
+
2768
+ const auto & kv_self = lctx.kv_self;
2769
+
2770
+ GGML_ASSERT(!!kv_self.ctx);
2771
+
2772
+ const int64_t n_embd = hparams.n_embd;
2773
+ const int64_t n_layer = hparams.n_layer;
2774
+ const int64_t n_ctx = hparams.n_ctx;
2775
+ const int64_t n_head = hparams.n_head;
2776
+ const int64_t n_head_kv = hparams.n_head_kv;
2777
+ const int64_t n_embd_head = hparams.n_embd_head();
2778
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
2779
+
2780
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
2781
+
2782
+ const float freq_base = hparams.rope_freq_base;
2783
+ const float freq_scale = hparams.rope_freq_scale;
2784
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
2785
+
2786
+ const int n_gpu_layers = model.n_gpu_layers;
2787
+
2788
+ auto & buf_compute = lctx.buf_compute;
2789
+
2790
+ struct ggml_init_params params = {
2791
+ /*.mem_size =*/ buf_compute.size,
2792
+ /*.mem_buffer =*/ buf_compute.data,
2793
+ /*.no_alloc =*/ false,
2794
+ };
2795
+
2796
+ params.no_alloc = true;
2797
+
2798
+ struct ggml_context * ctx0 = ggml_init(params);
2799
+
2800
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
2801
+
2802
+ struct ggml_tensor * cur;
2803
+ struct ggml_tensor * inpL;
2804
+
2805
+ if (tokens) {
2806
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2807
+
2808
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
2809
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2810
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2811
+ }
2812
+ ggml_set_name(inp_tokens, "inp_tokens");
2813
+
2814
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
2815
+ } else {
2816
+ #ifdef GGML_USE_MPI
2817
+ GGML_ASSERT(false && "not implemented");
2818
+ #endif
2819
+
2820
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2821
+
2822
+ ggml_allocr_alloc(lctx.alloc, inpL);
2823
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2824
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2825
+ }
2826
+ }
2827
+
2828
+ const int i_gpu_start = n_layer - n_gpu_layers;
2829
+ (void) i_gpu_start;
2830
+
2831
+ // offload functions set the tensor output backend to GPU
2832
+ // tensors are GPU-accelerated if any input or the output has been offloaded
2833
+ //
2834
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2835
+ // in that case ggml_cuda_assign_buffers has no effect
2836
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2837
+ offload_func_t offload_func_kq = llama_nop;
2838
+ offload_func_t offload_func_v = llama_nop;
2839
+
2840
+ #ifdef GGML_USE_CUBLAS
2841
+ if (n_gpu_layers > n_layer) {
2842
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
2843
+ }
2844
+ if (n_gpu_layers > n_layer + 1) {
2845
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
2846
+ }
2847
+ if (n_gpu_layers > n_layer + 2) {
2848
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
2849
+ }
2850
+ #endif // GGML_USE_CUBLAS
2851
+
2852
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2853
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
2854
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2855
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2856
+ }
2857
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2858
+
2859
+ for (int il = 0; il < n_layer; ++il) {
2860
+ ggml_format_name(inpL, "layer_inp_%d", il);
2861
+
2862
+ offload_func_t offload_func = llama_nop;
2863
+
2864
+ #ifdef GGML_USE_CUBLAS
2865
+ if (il >= i_gpu_start) {
2866
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
2867
+ }
2868
+ #endif // GGML_USE_CUBLAS
2869
+
2870
+ struct ggml_tensor * inpSA = inpL;
2871
+
2872
+ // norm
2873
+ {
2874
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
2875
+ offload_func(cur);
2876
+ ggml_set_name(cur, "rms_norm_0");
2877
+
2878
+ // cur = cur*attn_norm(broadcasted)
2879
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
2880
+ offload_func(cur);
2881
+ ggml_set_name(cur, "attention_norm_0");
2882
+ }
2883
+
2884
+ // self-attention
2885
+ {
2886
+ // compute Q and K and RoPE them
2887
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
2888
+ offload_func_kq(tmpk);
2889
+ ggml_set_name(tmpk, "tmpk");
2890
+
2891
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
2892
+ offload_func_kq(tmpq);
2893
+ ggml_set_name(tmpq, "tmpq");
2894
+
2895
+ struct ggml_tensor * Kcur;
2896
+ struct ggml_tensor * Qcur;
2897
+ switch (model.type) {
2898
+ case MODEL_7B:
2899
+ Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2900
+ Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2901
+ break;
2902
+ case MODEL_13B:
2903
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2904
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
2905
+ break;
2906
+ default:
2907
+ GGML_ASSERT(false);
2908
+ }
2909
+
2358
2910
  offload_func_kq(Kcur);
2359
2911
  ggml_set_name(Kcur, "Kcur");
2360
2912
 
2361
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2362
2913
  offload_func_kq(Qcur);
2363
2914
  ggml_set_name(Qcur, "Qcur");
2364
2915
 
@@ -2413,10 +2964,26 @@ static struct ggml_cgraph * llm_build_llama(
2413
2964
  offload_func_kq(KQ_scaled);
2414
2965
  ggml_set_name(KQ_scaled, "KQ_scaled");
2415
2966
 
2967
+ struct ggml_tensor * KQ_masked;
2968
+ struct ggml_tensor * KQ_scaled_alibi;
2969
+
2970
+ switch (model.type) {
2971
+ case MODEL_7B:
2972
+ KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2973
+ break;
2974
+ case MODEL_13B:
2975
+ KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
2976
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2977
+ KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2978
+ break;
2979
+ default:
2980
+ GGML_ASSERT(false);
2981
+ }
2416
2982
  // KQ_masked = mask_past(KQ_scaled)
2417
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2418
- offload_func_kq(KQ_masked);
2419
- ggml_set_name(KQ_masked, "KQ_masked");
2983
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2984
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2985
+ // offload_func_kq(KQ_masked);
2986
+ // ggml_set_name(KQ_masked, "KQ_masked");
2420
2987
 
2421
2988
  // KQ = soft_max(KQ_masked)
2422
2989
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
@@ -2851,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
2851
3418
  return gf;
2852
3419
  }
2853
3420
 
3421
+ static struct ggml_cgraph * llm_build_starcoder(
3422
+ llama_context & lctx,
3423
+ const llama_token * tokens,
3424
+ const float * embd,
3425
+ int n_tokens,
3426
+ int n_past) {
3427
+
3428
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3429
+
3430
+ const int N = n_tokens;
3431
+
3432
+ const auto & model = lctx.model;
3433
+ const auto & hparams = model.hparams;
3434
+
3435
+ const auto & kv_self = lctx.kv_self;
3436
+
3437
+ GGML_ASSERT(!!kv_self.ctx);
3438
+
3439
+ const int64_t n_embd = hparams.n_embd;
3440
+ const int64_t n_layer = hparams.n_layer;
3441
+ const int64_t n_ctx = hparams.n_ctx;
3442
+ const int64_t n_head = hparams.n_head;
3443
+ const int64_t n_head_kv = hparams.n_head_kv;
3444
+ const int64_t n_embd_head = hparams.n_embd_head();
3445
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3446
+
3447
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
3448
+
3449
+ const float norm_eps = hparams.f_norm_eps;
3450
+
3451
+ auto & buf_compute = lctx.buf_compute;
3452
+
3453
+ struct ggml_init_params params = {
3454
+ /*.mem_size =*/ buf_compute.size,
3455
+ /*.mem_buffer =*/ buf_compute.data,
3456
+ /*.no_alloc =*/ false,
3457
+ };
3458
+
3459
+ params.no_alloc = true;
3460
+
3461
+ struct ggml_context * ctx0 = ggml_init(params);
3462
+
3463
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3464
+
3465
+ struct ggml_tensor * cur;
3466
+ struct ggml_tensor * token;
3467
+ struct ggml_tensor * position;
3468
+ struct ggml_tensor * inpL;
3469
+
3470
+ if (tokens) {
3471
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3472
+
3473
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3474
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3475
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3476
+ }
3477
+ ggml_set_name(inp_tokens, "inp_tokens");
3478
+
3479
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3480
+ } else {
3481
+ #ifdef GGML_USE_MPI
3482
+ GGML_ASSERT(false && "not implemented");
3483
+ #endif
3484
+
3485
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3486
+
3487
+ ggml_allocr_alloc(lctx.alloc, token);
3488
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3489
+ memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
3490
+ }
3491
+ }
3492
+
3493
+ {
3494
+ // Compute position embeddings.
3495
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3496
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
3497
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3498
+ for (int i = 0; i < N; ++i) {
3499
+ ((int32_t *) inp_positions->data)[i] = n_past + i;
3500
+ }
3501
+ }
3502
+ ggml_set_name(inp_positions, "inp_positions");
3503
+
3504
+ position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3505
+ }
3506
+
3507
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3508
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3509
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3510
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3511
+ }
3512
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3513
+
3514
+ inpL = ggml_add(ctx0, token, position);
3515
+ ggml_set_name(inpL, "inpL");
3516
+
3517
+ for (int il = 0; il < n_layer; ++il) {
3518
+ {
3519
+ // Norm
3520
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3521
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
3522
+ }
3523
+
3524
+ {
3525
+ // Self Attention
3526
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3527
+
3528
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
3529
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
3530
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3531
+
3532
+ struct ggml_tensor * Qcur = tmpq;
3533
+ struct ggml_tensor * Kcur = tmpk;
3534
+
3535
+ {
3536
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3537
+ ggml_set_name(Vcur, "Vcur");
3538
+
3539
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3540
+ ggml_set_name(k, "k");
3541
+
3542
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3543
+ ( n_ctx)*ggml_element_size(kv_self.v),
3544
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3545
+
3546
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3547
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3548
+ }
3549
+
3550
+ struct ggml_tensor * Q =
3551
+ ggml_permute(ctx0,
3552
+ ggml_cpy(ctx0,
3553
+ Qcur,
3554
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
3555
+ 0, 2, 1, 3);
3556
+ ggml_set_name(Q, "Q");
3557
+
3558
+ struct ggml_tensor * K =
3559
+ ggml_view_3d(ctx0, kv_self.k,
3560
+ n_embd_head, n_past + N, n_head_kv,
3561
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3562
+ ggml_element_size(kv_self.k)*n_embd_head,
3563
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3564
+ ggml_set_name(K, "K");
3565
+
3566
+ // K * Q
3567
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3568
+ ggml_set_name(KQ, "KQ");
3569
+
3570
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3571
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
3572
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3573
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3574
+
3575
+ // KQ_masked = mask_past(KQ_scaled)
3576
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3577
+ ggml_set_name(KQ_masked, "KQ_masked");
3578
+
3579
+ // KQ = soft_max(KQ_masked)
3580
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3581
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3582
+
3583
+ // split cached V into n_head heads
3584
+ struct ggml_tensor * V =
3585
+ ggml_view_3d(ctx0, kv_self.v,
3586
+ n_past + N, n_embd_head, n_head_kv,
3587
+ ggml_element_size(kv_self.v)*n_ctx,
3588
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3589
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3590
+ ggml_set_name(V, "V");
3591
+
3592
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3593
+ ggml_set_name(KQV, "KQV");
3594
+
3595
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3596
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3597
+ ggml_set_name(KQV_merged, "KQV_merged");
3598
+
3599
+ // cur = KQV_merged.contiguous().view(n_embd, N)
3600
+ cur = ggml_cpy(ctx0,
3601
+ KQV_merged,
3602
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3603
+ ggml_set_name(cur, "KQV_merged_contiguous");
3604
+ }
3605
+
3606
+ // Projection
3607
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
3608
+
3609
+ // Add the input
3610
+ cur = ggml_add(ctx0, cur, inpL);
3611
+
3612
+ struct ggml_tensor * inpFF = cur;
3613
+
3614
+ // FF
3615
+ {
3616
+ // Norm
3617
+ {
3618
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
3619
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
3620
+ }
3621
+
3622
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
3623
+
3624
+ // GELU activation
3625
+ cur = ggml_gelu(ctx0, cur);
3626
+
3627
+ // Projection
3628
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
3629
+ }
3630
+
3631
+ inpL = ggml_add(ctx0, cur, inpFF);
3632
+ }
3633
+
3634
+ // Output Norm
3635
+ {
3636
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3637
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
3638
+ }
3639
+ ggml_set_name(cur, "result_norm");
3640
+
3641
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3642
+ ggml_set_name(cur, "result_output");
3643
+
3644
+ ggml_build_forward_expand(gf, cur);
3645
+ ggml_free(ctx0);
3646
+
3647
+ return gf;
3648
+ }
3649
+
2854
3650
  static struct ggml_cgraph * llama_build_graph(
2855
3651
  llama_context & lctx,
2856
3652
  const llama_token * tokens,
@@ -2866,10 +3662,18 @@ static struct ggml_cgraph * llama_build_graph(
2866
3662
  {
2867
3663
  result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
2868
3664
  } break;
3665
+ case LLM_ARCH_BAICHUAN:
3666
+ {
3667
+ result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
3668
+ } break;
2869
3669
  case LLM_ARCH_FALCON:
2870
3670
  {
2871
3671
  result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
2872
3672
  } break;
3673
+ case LLM_ARCH_STARCODER:
3674
+ {
3675
+ result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
3676
+ } break;
2873
3677
  default:
2874
3678
  GGML_ASSERT(false);
2875
3679
  };
@@ -2956,6 +3760,15 @@ static bool llama_eval_internal(
2956
3760
  n_threads = std::min(4, n_threads);
2957
3761
  }
2958
3762
 
3763
+ // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
3764
+ const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
3765
+ model.arch == LLM_ARCH_BAICHUAN ||
3766
+ model.arch == LLM_ARCH_FALCON;
3767
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
3768
+ if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
3769
+ n_threads = 1;
3770
+ }
3771
+
2959
3772
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
2960
3773
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
2961
3774
 
@@ -2971,10 +3784,6 @@ static bool llama_eval_internal(
2971
3784
  if (lctx.ctx_metal) {
2972
3785
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
2973
3786
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
2974
- ggml_metal_get_tensor (lctx.ctx_metal, res);
2975
- if (!lctx.embedding.empty()) {
2976
- ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
2977
- }
2978
3787
  } else {
2979
3788
  ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
2980
3789
  }
@@ -3123,10 +3932,9 @@ struct llm_tokenizer_spm {
3123
3932
  while (offs < text.size()) {
3124
3933
  llm_symbol sym;
3125
3934
  size_t len = utf8_len(text[offs]);
3126
- GGML_ASSERT(offs + len <= text.size());
3127
3935
  sym.text = text.c_str() + offs;
3128
- sym.n = len;
3129
- offs += len;
3936
+ sym.n = std::min(len, text.size() - offs);
3937
+ offs += sym.n;
3130
3938
  sym.prev = index - 1;
3131
3939
  sym.next = offs == text.size() ? -1 : index + 1;
3132
3940
  index++;
@@ -3488,7 +4296,7 @@ struct llama_grammar_candidate {
3488
4296
 
3489
4297
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
3490
4298
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
3491
- std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
4299
+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
3492
4300
  const char * src,
3493
4301
  llama_partial_utf8 partial_start) {
3494
4302
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -4642,7 +5450,16 @@ void llama_beam_search(llama_context * ctx,
4642
5450
  // quantization
4643
5451
  //
4644
5452
 
4645
- static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
5453
+ template <typename T>
5454
+ struct no_init {
5455
+ T value;
5456
+ no_init() { /* do nothing */ }
5457
+ };
5458
+
5459
+ static void llama_convert_tensor_internal(
5460
+ struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
5461
+ const size_t nelements, const int nthread
5462
+ ) {
4646
5463
  if (output.size() < nelements) {
4647
5464
  output.resize(nelements);
4648
5465
  }
@@ -4677,7 +5494,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
4677
5494
  auto blocks_per_thread = nblocks / nthread;
4678
5495
  auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
4679
5496
 
4680
- std::vector<std::thread> workers;
4681
5497
  for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
4682
5498
  auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
4683
5499
  auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5506,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
4690
5506
  qtype.to_float(inbuf, outbuf, nels);
4691
5507
  }
4692
5508
  };
4693
- workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
5509
+ workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
4694
5510
  in_buff_offs += thr_block_bytes;
4695
5511
  out_buff_offs += thr_elems;
4696
5512
  }
4697
- for (auto & worker : workers) {
4698
- worker.join();
5513
+ for (auto & w : workers) { w.join(); }
5514
+ workers.clear();
5515
+ }
5516
+
5517
+ #ifdef GGML_USE_K_QUANTS
5518
+ static ggml_type get_k_quant_type(
5519
+ ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
5520
+ int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
5521
+ ) {
5522
+ const std::string name = ggml_get_name(tensor);
5523
+ // TODO: avoid hardcoded tensor names - use the TN_* constants
5524
+ const auto tn = LLM_TN(model.arch);
5525
+
5526
+ auto use_more_bits = [](int i_layer, int num_layers) -> bool {
5527
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
5528
+ };
5529
+
5530
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5531
+ int nx = tensor->ne[0];
5532
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
5533
+ new_type = GGML_TYPE_Q8_0;
5534
+ }
5535
+ else if (new_type != GGML_TYPE_Q8_0) {
5536
+ new_type = GGML_TYPE_Q6_K;
5537
+ }
5538
+ } else if (name.find("attn_v.weight") != std::string::npos) {
5539
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5540
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5541
+ new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5542
+ }
5543
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5544
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
5545
+ use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
5546
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
5547
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
5548
+ (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
5549
+ if (model.type == MODEL_70B) {
5550
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
5551
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
5552
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
5553
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
5554
+ }
5555
+ ++*i_attention_wv;
5556
+ } else if (name.find("ffn_down.weight") != std::string::npos) {
5557
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5558
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5559
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
5560
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
5561
+ : GGML_TYPE_Q3_K;
5562
+ }
5563
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
5564
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
5565
+ }
5566
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
5567
+ if (model.arch == LLM_ARCH_FALCON) {
5568
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
5569
+ use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5570
+ } else {
5571
+ if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5572
+ }
5573
+ }
5574
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5575
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
5576
+ new_type = GGML_TYPE_Q5_K;
5577
+ }
5578
+ ++*i_feed_forward_w2;
5579
+ } else if (name.find("attn_output.weight") != std::string::npos) {
5580
+ if (model.arch != LLM_ARCH_FALCON) {
5581
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
5582
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
5583
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5584
+ } else {
5585
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5586
+ }
5587
+ }
5588
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
5589
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5590
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
5591
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
5592
+ }
5593
+ else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
5594
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5595
+ }
5596
+ // This can be used to reduce the size of the Q5_K_S model.
5597
+ // The associated PPL increase is fully in line with the size reduction
5598
+ //else {
5599
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
5600
+ //}
5601
+ bool convert_incompatible_tensor = false;
5602
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
5603
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
5604
+ int nx = tensor->ne[0];
5605
+ int ny = tensor->ne[1];
5606
+ if (nx % QK_K != 0) {
5607
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
5608
+ convert_incompatible_tensor = true;
5609
+ }
5610
+ }
5611
+ if (convert_incompatible_tensor) {
5612
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5613
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
5614
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
5615
+ } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
5616
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
5617
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
5618
+ } else {
5619
+ throw std::runtime_error("Unsupported tensor size encountered\n");
5620
+ }
4699
5621
  }
5622
+
5623
+ return new_type;
4700
5624
  }
5625
+ #endif
4701
5626
 
4702
5627
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
4703
5628
  ggml_type quantized_type;
@@ -4782,18 +5707,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4782
5707
  std::vector<int64_t> hist_all(1 << 4, 0);
4783
5708
 
4784
5709
  std::vector<std::thread> workers;
5710
+ workers.reserve(nthread);
4785
5711
  std::mutex mutex;
4786
5712
 
4787
- #ifdef GGML_USE_K_QUANTS
4788
- auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
4789
- return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
4790
- };
4791
- #endif
4792
-
4793
5713
  int idx = 0;
4794
5714
 
4795
- std::vector<uint8_t> read_data;
4796
- std::vector<uint8_t> work;
5715
+ std::vector<no_init<uint8_t>> read_data;
5716
+ std::vector<no_init<uint8_t>> work;
5717
+ std::vector<no_init<float>> f32_conv_buf;
4797
5718
 
4798
5719
  // populate the original tensors so we get an initial meta data
4799
5720
  for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4815,7 +5736,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4815
5736
 
4816
5737
  const std::string name = ggml_get_name(tensor);
4817
5738
 
4818
- read_data.resize(ggml_nbytes(tensor));
5739
+ if (read_data.size() < ggml_nbytes(tensor)) {
5740
+ read_data.resize(ggml_nbytes(tensor));
5741
+ }
4819
5742
  tensor->data = read_data.data();
4820
5743
  ml->load_data_for(tensor);
4821
5744
 
@@ -4840,101 +5763,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (quantize) {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
- // TODO: avoid hardcoded tensor names - use the TN_* constants
- const auto tn = LLM_TN(ml->get_arch());
-
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
- int nx = tensor->ne[0];
- if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
- new_type = GGML_TYPE_Q8_0;
- }
- else if (new_type != GGML_TYPE_Q8_0) {
- new_type = GGML_TYPE_Q6_K;
- }
- } else if (name.find("attn_v.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
- if (model.type == MODEL_70B) {
- // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
- // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
- // nearly negligible increase in model size by quantizing this tensor with more bits:
- if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
- }
- ++i_attention_wv;
- } else if (name.find("ffn_down.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
- : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
- : GGML_TYPE_Q3_K;
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
- new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
- if (model.arch == LLM_ARCH_FALCON) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
- } else {
- if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- }
- }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
- new_type = GGML_TYPE_Q5_K;
- }
- ++i_feed_forward_w2;
- } else if (name.find("attn_output.weight") != std::string::npos) {
- if (model.arch != LLM_ARCH_FALCON) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- } else {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
- }
- }
- else if (name.find("attn_qkv.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
- }
- else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- }
- // This can be used to reduce the size of the Q5_K_S model.
- // The associated PPL increase is fully in line with the size reduction
- //else {
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
- //}
- bool convert_incompatible_tensor = false;
- if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
- int nx = tensor->ne[0];
- int ny = tensor->ne[1];
- if (nx % QK_K != 0) {
- LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
- convert_incompatible_tensor = true;
- }
- }
- if (convert_incompatible_tensor) {
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
- new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
- LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
- } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
- new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
- LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
- } else {
- throw std::runtime_error("Unsupported tensor size encountered\n");
- }
- }
+ new_type = get_k_quant_type(
+ new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+ );
  #endif
  // If we've decided to quantize to the same type the tensor is already
  // in then there's nothing to do.
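
The inline k-quant selection removed above now lives in a `get_k_quant_type()` helper (added earlier in this diff), and the `use_more_bits()` lambda dropped from `llama_model_quantize_internal` moves with it. The heuristic itself is unchanged: the first eighth and last eighth of the layers, plus every third layer in between, get a higher-precision type. A standalone, compilable sketch of just that predicate (the driver loop is illustrative only):

#include <cstdio>

// Same predicate as the use_more_bits lambda removed in this hunk.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layers = 32; // e.g. a 7B LLaMA-style model
    for (int i = 0; i < n_layers; ++i) {
        std::printf("layer %2d -> %s\n", i, use_more_bits(i, n_layers) ? "more bits" : "base type");
    }
    return 0;
}
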
@@ -4949,23 +5780,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  const size_t nelements = ggml_nelements(tensor);

  float * f32_data;
- std::vector<float> f32_conv_buf;

  if (tensor->type == GGML_TYPE_F32) {
  f32_data = (float *) tensor->data;
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
  } else {
- llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+ llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
  f32_data = (float *) f32_conv_buf.data();
  }

  LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
  fflush(stdout);

- work.resize(nelements * 4); // upper bound on size
+ if (work.size() < nelements * 4) {
+ work.resize(nelements * 4); // upper bound on size
+ }
  new_data = work.data();
- std::vector<int64_t> hist_cur(1 << 4, 0);
+ std::array<int64_t, 1 << 4> hist_cur = {};

  static const int chunk_size = 32 * 512;
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4976,13 +5808,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t counter = 0;
  new_size = 0;
  auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
- std::vector<int64_t> local_hist;
+ std::array<int64_t, 1 << 4> local_hist = {};
  size_t local_size = 0;
  while (true) {
  std::unique_lock<std::mutex> lock(mutex);
  size_t first = counter; counter += chunk_size;
  if (first >= nelements) {
- if (!local_hist.empty()) {
+ if (local_size > 0) {
  for (int j=0; j<int(local_hist.size()); ++j) {
  hist_cur[j] += local_hist[j];
  }
@@ -4992,22 +5824,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  lock.unlock();
  size_t last = std::min(nelements, first + chunk_size);
- if (local_hist.empty()) {
- local_hist.resize(hist_cur.size(), 0);
- }
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
  }
  };
- if ((int) workers.size() < nthread_use - 1) {
- workers.resize(nthread_use - 1);
- }
  for (int it = 0; it < nthread_use - 1; ++it) {
- workers[it] = std::thread(compute);
+ workers.emplace_back(compute);
  }
  compute();
- for (int it = 0; it < nthread_use - 1; ++it) {
- workers[it].join();
- }
+ for (auto & w : workers) { w.join(); }
+ workers.clear();
  }

  LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -5069,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }

  // TODO: after the GGUF PR, this likely won't work and needs to be updated
- int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+ static int llama_apply_lora_from_file_internal(
+ const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+ ) {
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

  const int64_t t_start_lora_us = ggml_time_us();
@@ -5353,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
  /*.n_gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
- /*.rope_freq_base =*/ 10000.0f,
- /*.rope_freq_scale =*/ 1.0f,
+ /*.rope_freq_base =*/ 0.0f,
+ /*.rope_freq_scale =*/ 0.0f,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
@@ -5616,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
  return ctx;
  }

- struct llama_context * llama_init_from_file(
+ static struct llama_context * llama_init_from_file(
  const char * path_model,
  struct llama_context_params params) {
  struct llama_model * model = llama_load_model_from_file(path_model, params);
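
With `llama_init_from_file` now marked `static` (internal to this translation unit), external code is expected to use the two-step `llama_load_model_from_file()` + `llama_new_context_with_model()` sequence that this wrapper itself calls. A hedged usage sketch (the model path is a placeholder; the 0.0f defaults for `rope_freq_base`/`rope_freq_scale` two hunks above appear to mean "use the values stored in the model file", so the previous library defaults are pinned explicitly here purely for illustration):

#include "llama.h"
#include <cstdio>

int main() {
    struct llama_context_params params = llama_context_default_params();
    // Optional: pin the previous RoPE defaults instead of deferring to the model file.
    params.rope_freq_base  = 10000.0f;
    params.rope_freq_scale = 1.0f;

    struct llama_model * model = llama_load_model_from_file("model.gguf", params); // placeholder path
    if (model == NULL) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }
    struct llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize / eval / sample ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
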
@@ -5635,15 +6462,19 @@ void llama_free(struct llama_context * ctx) {
  }

  int llama_n_vocab(const struct llama_context * ctx) {
- return ctx->model.vocab.id_to_token.size();
+ return llama_model_n_vocab(&ctx->model);
  }

  int llama_n_ctx(const struct llama_context * ctx) {
- return ctx->model.hparams.n_ctx;
+ return llama_model_n_ctx(&ctx->model);
+ }
+
+ int llama_n_ctx_train(const struct llama_context * ctx) {
+ return llama_model_n_ctx_train(&ctx->model);
  }

  int llama_n_embd(const struct llama_context * ctx) {
- return ctx->model.hparams.n_embd;
+ return llama_model_n_embd(&ctx->model);
  }

  enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6489,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
  return model->hparams.n_ctx;
  }

+ int llama_model_n_ctx_train(const struct llama_model * model) {
+ return model->hparams.n_ctx_train;
+ }
+
  int llama_model_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }
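
The new `llama_n_ctx_train` / `llama_model_n_ctx_train` accessors expose the context length the model was trained with (`hparams.n_ctx_train`), as opposed to the runtime context configured by the caller. A small illustrative helper built on the new accessors (the warning text and policy are not part of the library):

#include "llama.h"
#include <cstdio>

// Illustrative: warn when the configured context exceeds the training context.
static void check_context_size(const struct llama_context * ctx) {
    const int n_ctx       = llama_n_ctx(ctx);
    const int n_ctx_train = llama_n_ctx_train(ctx);
    if (n_ctx > n_ctx_train) {
        std::fprintf(stderr,
            "warning: n_ctx (%d) is larger than the model's training context (%d); "
            "generation quality may degrade past %d tokens\n",
            n_ctx, n_ctx_train, n_ctx_train);
    }
}
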
@@ -5813,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
  * llama_copy_state_data(ctx, &data_ctx);
  *
  */
- void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
  // copy rng
  {
  std::stringstream rng_ss;
@@ -6197,22 +7032,24 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
  int llama_tokenize(
  struct llama_context * ctx,
  const char * text,
+ int text_len,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+ return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
  }

  int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
+ int text_len,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);

  if (n_max_tokens < (int) res.size()) {
- LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
  return -((int) res.size());
  }
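
`llama_tokenize` and `llama_tokenize_with_model` now take the text length explicitly, and an undersized output buffer is reported only through the negated required token count (the error log is commented out). A hedged sketch of the grow-and-retry wrapper callers typically build on top of this (buffer sizing is illustrative):

#include "llama.h"
#include <string>
#include <vector>

// Illustrative wrapper: tokenize a std::string, growing the output buffer if the
// first call returns the (negated) number of tokens actually required.
static std::vector<llama_token> tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0)); // generous first guess
    int n = llama_tokenize(ctx, text.data(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        tokens.resize(-n); // -n is the required token count
        n = llama_tokenize(ctx, text.data(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize(n);
    return tokens;
}
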
 
@@ -6351,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
  }

  // For internal test use
- const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+ struct llama_context * ctx
+ ) {
  return ctx->model.tensors_by_name;
  }
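
`llama_internal_get_tensor_map` is exposed only for internal tests, behind the `LLAMA_API_INTERNAL` define that this file now sets at the top. A hedged sketch of how a test might walk the map (the output format is illustrative, and it assumes the internal section of the header declares the function as shown here):

#define LLAMA_API_INTERNAL
#include "llama.h"
#include "ggml.h"

#include <cstdio>

// Illustrative: print each loaded tensor's name and element count.
static void dump_tensor_map(struct llama_context * ctx) {
    const auto & tensors = llama_internal_get_tensor_map(ctx);
    for (const auto & it : tensors) {
        std::printf("%-48s %12lld elements\n", it.first.c_str(), (long long) ggml_nelements(it.second));
    }
}
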