llama_cpp 0.5.2 → 0.5.3

@@ -1,3 +1,4 @@
+#define LLAMA_API_INTERNAL
 #include "llama.h"
 
 #include "ggml.h"
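
Note: defining `LLAMA_API_INTERNAL` before including `llama.h` is the usual way to opt into declarations that the header presumably hides behind that guard, such as the `llama_internal_get_tensor_map` accessor reworked at the end of this diff. A minimal sketch of a translation unit using it; anything beyond that one accessor is an assumption, not shown by this diff:

```cpp
// Sketch: a test/tool translation unit opting into llama.h's internal API.
// Only llama_internal_get_tensor_map appears in this diff; other guarded
// declarations are assumptions.
#define LLAMA_API_INTERNAL
#include "llama.h"

#include <cstdio>

static void dump_tensor_names(llama_context * ctx) {
    // presumably declared only when LLAMA_API_INTERNAL is defined
    const auto & tensors = llama_internal_get_tensor_map(ctx);
    for (const auto & kv : tensors) {
        std::printf("%s\n", kv.first.c_str());
    }
}
```
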
@@ -108,7 +109,7 @@ static size_t utf8_len(char src) {
     return lookup[highbits];
 }
 
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
@@ -160,17 +161,19 @@ enum llm_arch {
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama" },
-    { LLM_ARCH_FALCON, "falcon" },
-    { LLM_ARCH_GPT2, "gpt2" },
-    { LLM_ARCH_GPTJ, "gptj" },
-    { LLM_ARCH_GPTNEOX, "gptneox" },
-    { LLM_ARCH_MPT, "mpt" },
-    { LLM_ARCH_BAICHUAN,"baichuan" },
+    { LLM_ARCH_LLAMA,     "llama"     },
+    { LLM_ARCH_FALCON,    "falcon"    },
+    { LLM_ARCH_GPT2,      "gpt2"      },
+    { LLM_ARCH_GPTJ,      "gptj"      },
+    { LLM_ARCH_GPTNEOX,   "gptneox"   },
+    { LLM_ARCH_MPT,       "mpt"       },
+    { LLM_ARCH_BAICHUAN,  "baichuan"  },
+    { LLM_ARCH_STARCODER, "starcoder" },
 };
 
 enum llm_kv {
@@ -376,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_POS_EMBD,    "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
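
Note: the StarCoder entries mix whole-model tensor names ("token_embd", "position_embd", "output_norm", "output") with per-layer templates that contain "%d". Later hunks expand these through the `tn(...)` helper with a layer index and a "weight"/"bias" suffix; a rough, hypothetical sketch of that expansion (the real helper's name and signature are not shown in this hunk):

```cpp
// Hypothetical sketch of how the "%d" templates above become concrete GGUF
// tensor names such as "blk.7.attn_qkv.weight".
#include <cstdio>
#include <string>

static std::string tensor_name(const char * templ, const char * suffix, int layer) {
    char base[128];
    std::snprintf(base, sizeof(base), templ, layer);  // "blk.%d.attn_qkv" -> "blk.7.attn_qkv"
    return std::string(base) + "." + suffix;          // append "weight" or "bias"
}

int main() {
    std::printf("%s\n", tensor_name("blk.%d.attn_qkv", "weight", 7).c_str());
    std::printf("%s\n", tensor_name("blk.%d.ffn_down", "bias",   7).c_str());
    return 0;
}
```
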
@@ -680,6 +698,7 @@ struct llama_mmap {
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -889,9 +908,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_1B,
     MODEL_3B,
     MODEL_7B,
     MODEL_13B,
+    MODEL_15B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -901,24 +922,24 @@ enum e_model {
 
 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab     = 32000;
-    uint32_t n_ctx_train = 2048; // the context size used during training
-    uint32_t n_ctx       = 512;  // the context size used during inference
-    uint32_t n_embd      = 4096;
-    uint32_t n_head      = 32;
-    uint32_t n_head_kv   = 32;
-    uint32_t n_layer     = 32;
-    uint32_t n_rot       = 64;
-    uint32_t n_ff        = 11008;
-
-    float f_norm_eps     = 1e-5;
-    float f_norm_rms_eps = 1e-5;
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
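
Note: with the hard-coded LLaMA-7B defaults removed, every field now has to be filled in by the loader (via the `GGUF_GET_KEY` calls in the hunks below). Since `operator!=` compares the struct with `memcmp`, leaving fields uninitialized would make that comparison unreliable, which is why `llama_model` value-initializes its `hparams` member a few hunks later. A minimal stand-alone sketch of the idea, using a stand-in struct rather than the real `llama_hparams`:

```cpp
// Sketch of why value-initialization matters when a struct is compared with
// memcmp: indeterminate bytes (fields or padding) break the comparison.
// The struct here is a stand-in, not the real llama_hparams.
#include <cstdint>
#include <cstdio>
#include <cstring>

struct hparams_like {
    uint32_t n_vocab;
    uint32_t n_embd;
    float    f_norm_eps;

    bool operator!=(const hparams_like & other) const {
        return std::memcmp(this, &other, sizeof(hparams_like)) != 0;
    }
};

int main() {
    hparams_like a = {};  // value-initialized: all bytes zero
    hparams_like b = {};
    a.n_vocab = 32000; a.n_embd = 4096; a.f_norm_eps = 1e-5f;
    b.n_vocab = 32000; b.n_embd = 4096; b.f_norm_eps = 1e-5f;
    std::printf("differ: %d\n", a != b ? 1 : 0);  // 0: byte-identical
    return 0;
}
```
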
@@ -960,13 +981,22 @@ struct llama_layer {
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;
 
+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
     // normalization
     struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;
 
     // ff
     struct ggml_tensor * w1; // ffn_gate
     struct ggml_tensor * w2; // ffn_down
     struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
 };
 
 struct llama_kv_cache {
@@ -1040,10 +1070,11 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1244,6 +1275,7 @@ struct llama_model_loader {
     int n_created = 0;
 
     int64_t n_elements = 0;
+    size_t  n_bytes    = 0;
 
     bool use_mmap = false;
 
@@ -1276,6 +1308,7 @@ struct llama_model_loader {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
             n_elements += ggml_nelements(t);
+            n_bytes    += ggml_nbytes(t);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1554,7 +1587,7 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-std::string llama_model_ftype_name(enum llama_ftype ftype) {
+static std::string llama_model_ftype_name(enum llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1587,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
@@ -1633,28 +1668,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    // TODO: manually setting rope freq base and scale should override this
-    // FIXME: partial fix when the param specified is not the default value, but
-    //        will not work for overriding the model value to the params default
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
    }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
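
Note: the old reconciliation against the context-params defaults is replaced by a simpler rule: a value of 0.0f from the caller means "not set", so the loader falls back to the GGUF key (or 10000.0f / 1.0f). The scale key is stored as a linear factor and inverted into `rope_freq_scale`. A small worked sketch of that resolution with assumed inputs, detached from the real loader:

```cpp
// Worked sketch of the new rope parameter resolution; the inputs are
// illustrative assumptions, not values from a real model.
#include <cstdio>

static float resolve_freq_scale(float requested, bool kv_present, float kv_linear_scale) {
    if (requested != 0.0f) {
        return requested;                 // explicit override from the caller wins
    }
    const float ropescale = kv_present ? kv_linear_scale : 1.0f;
    return 1.0f / ropescale;              // GGUF stores the linear factor, hparams its inverse
}

int main() {
    // e.g. a model trained with 4x linear scaling -> rope_freq_scale = 0.25
    std::printf("%.3f\n", resolve_freq_scale(0.0f, true, 4.0f));  // 0.250
    std::printf("%.3f\n", resolve_freq_scale(0.5f, true, 4.0f));  // 0.500 (override)
    return 0;
}
```
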
@@ -1707,6 +1731,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };
 
@@ -1860,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
+    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }
 
     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
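
Note: the meta printout now separates parameter count from on-disk tensor size and reports bits per weight (BPW = bytes * 8 / elements), using the new `GB` constant to choose MiB vs GiB. A stand-alone sketch of the same arithmetic with made-up numbers:

```cpp
// Stand-alone sketch of the new size/BPW report; the numbers are assumed
// examples, not measurements from a real model file.
#include <cstdint>
#include <cstdio>

int main() {
    const size_t kB = 1024, MB = kB*kB, GB = kB*kB*kB;
    (void) MB;

    const int64_t n_elements = 6738415616LL;   // ~6.74e9 parameters (assumed)
    const size_t  n_bytes    = 3825065984ULL;  // ~3.56 GiB on disk (assumed)

    std::printf("model params = %.2f B\n", n_elements*1e-9);
    if (n_bytes < GB) {
        std::printf("model size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
    } else {
        std::printf("model size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
    }
    return 0;
}
```
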
@@ -2160,6 +2200,85 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+                    layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                            ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+                            ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2)          +
+                            ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     };
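
Note: the fused QKV weight shape `{n_embd, n_embd + 2*n_embd_gqa}` reflects multi-query attention: K and V only use `n_head_kv` heads, so `n_embd_gqa = n_embd_head * n_head_kv`. A small sketch of that arithmetic with StarCoder-like numbers; the concrete head counts are illustrative assumptions:

```cpp
// Dimension arithmetic behind the fused QKV tensor above. The specific
// sizes are illustrative assumptions (MQA: a single KV head).
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_embd    = 6144;  // assumed embedding width
    const uint32_t n_head    = 48;    // assumed query heads
    const uint32_t n_head_kv = 1;     // multi-query attention: one shared KV head

    const uint32_t n_embd_head = n_embd / n_head;          // 128
    const uint32_t n_embd_gqa  = n_embd_head * n_head_kv;  // 128

    // wqkv: {n_embd, n_embd + 2*n_embd_gqa}, bqkv: {n_embd + 2*n_embd_gqa}
    std::printf("qkv weight: %u x %u\n", n_embd, n_embd + 2*n_embd_gqa);
    return 0;
}
```
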
@@ -3299,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
     return gf;
 }
 
+static struct ggml_cgraph * llm_build_starcoder(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * position;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < N; ++i) {
+                ((int32_t *) inp_positions->data)[i] = n_past + i;
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");
+
+        position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+    }
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,     N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_token * tokens,
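
Note: inside `llm_build_starcoder` the fused QKV activation is sliced into Q, K and V with `ggml_view_2d`, using per-row byte offsets of 0, `n_embd` and `n_embd + n_embd_gqa` floats. A compact sketch of just that offset arithmetic, detached from the graph code, with assumed sizes:

```cpp
// Sketch of the row-offset arithmetic behind the ggml_view_2d calls that
// slice the fused QKV activation into Q, K and V (sizes assumed).
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_embd     = 6144;  // assumed
    const size_t n_embd_gqa = 128;   // assumed (MQA: one KV head)
    const size_t f32        = sizeof(float);

    // Each row of the QKV activation is laid out as [ Q | K | V ].
    const size_t q_offset = 0 * f32 * n_embd;
    const size_t k_offset = f32 * n_embd;
    const size_t v_offset = f32 * (n_embd + n_embd_gqa);

    std::printf("Q at byte %zu, K at byte %zu, V at byte %zu (row = %zu bytes)\n",
                q_offset, k_offset, v_offset, f32 * (n_embd + 2*n_embd_gqa));
    return 0;
}
```
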
@@ -3322,6 +3670,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         default:
             GGML_ASSERT(false);
     };
@@ -3408,6 +3760,15 @@ static bool llama_eval_internal(
         n_threads = std::min(4, n_threads);
     }
 
+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
 
@@ -3423,10 +3784,6 @@ static bool llama_eval_internal(
     if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, res);
-        if (!lctx.embedding.empty()) {
-            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
-        }
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
@@ -3939,7 +4296,7 @@ struct llama_grammar_candidate {
 
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -5537,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 // TODO: after the GGUF PR, this likely won't work and needs to be updated
-int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     const int64_t t_start_lora_us = ggml_time_us();
@@ -5821,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
-        /*.rope_freq_base =*/ 10000.0f,
-        /*.rope_freq_scale =*/ 1.0f,
+        /*.rope_freq_base =*/ 0.0f,
+        /*.rope_freq_scale =*/ 0.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
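
Note: with the defaults changed to 0.0f, a caller that leaves these fields untouched now inherits whatever the model's GGUF metadata specifies, and only a non-zero value overrides it. A hedged usage sketch built from the public calls that appear elsewhere in this diff; the model path and override value are placeholders:

```cpp
// Usage sketch for the new rope defaults; path and values are placeholders.
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();

    // params.rope_freq_base == 0.0f and params.rope_freq_scale == 0.0f here,
    // which now means "use the values stored in the model's GGUF metadata".

    // Explicit override only when needed, e.g. linear context scaling:
    // params.rope_freq_scale = 0.25f;

    llama_model * model = llama_load_model_from_file("model.gguf", params);
    if (!model) {
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, params);

    // ... run inference with ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```
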
@@ -6084,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_init_from_file(
+static struct llama_context * llama_init_from_file(
         const char * path_model,
         struct llama_context_params params) {
     struct llama_model * model = llama_load_model_from_file(path_model, params);
@@ -6289,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
  * llama_copy_state_data(ctx, &data_ctx);
  *
 */
-void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6673,19 +7032,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,
+        int text_len,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
 }
 
 int llama_tokenize_with_model(
         const struct llama_model * model,
         const char * text,
+        int text_len,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
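
Note: `llama_tokenize` and `llama_tokenize_with_model` now take an explicit `text_len` instead of relying on a NUL terminator, so callers pass the byte length along with the pointer. A hedged usage sketch; the negative-return-means-required-size convention shown here is the existing one and is assumed rather than established by this hunk:

```cpp
// Usage sketch for the updated tokenizer entry point; handling of a
// negative return value as "required token count" is assumed here.
#include "llama.h"

#include <string>
#include <vector>

static std::vector<llama_token> tokenize(llama_context * ctx, const std::string & text, bool add_bos) {
    // Worst case: roughly one token per byte, plus the optional BOS token.
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));

    int n = llama_tokenize(ctx, text.c_str(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        // buffer too small; retry with the reported size
        tokens.resize(-n);
        n = llama_tokenize(ctx, text.c_str(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}
```
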
@@ -6827,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
 }
 
 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
     return ctx->model.tensors_by_name;
 }