llama_cpp 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
+ #define LLAMA_API_INTERNAL
  #include "llama.h"

  #include "ggml.h"
@@ -108,7 +109,7 @@ static size_t utf8_len(char src) {
  return lookup[highbits];
  }

- void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
  std::string result;
  for (size_t pos = 0; ; pos += search.length()) {
  auto new_pos = s.find(search, pos);
@@ -160,17 +161,19 @@ enum llm_arch {
  LLM_ARCH_GPTJ,
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
+ LLM_ARCH_STARCODER,
  LLM_ARCH_UNKNOWN,
  };

  static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN,"baichuan" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
  };

  enum llm_kv {
@@ -376,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  },
  },
+ {
+ LLM_ARCH_STARCODER,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
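Note: the "%d" in the per-block names is filled in with the layer index, and a suffix such as "weight" or "bias" is appended when a tensor is looked up (see the tn(LLM_TENSOR_ATTN_QKV, "weight", i) calls in the StarCoder loading hunk further down). A minimal sketch of that expansion, with format_tensor_name as a hypothetical stand-in for the tn() helper:

    #include <cstdio>
    #include <string>

    // Hypothetical helper mirroring how the per-layer tensor names above are built.
    static std::string format_tensor_name(const char * fmt, const char * suffix, int layer) {
        char name[256];
        std::snprintf(name, sizeof(name), fmt, layer);  // "blk.%d.attn_qkv" -> "blk.0.attn_qkv"
        return std::string(name) + "." + suffix;        // -> "blk.0.attn_qkv.weight"
    }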
@@ -680,6 +698,7 @@ struct llama_mmap {
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
  fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
+ }
  }
  #else
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -889,9 +908,11 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_1B,
  MODEL_3B,
  MODEL_7B,
  MODEL_13B,
+ MODEL_15B,
  MODEL_30B,
  MODEL_34B,
  MODEL_40B,
@@ -901,24 +922,24 @@ enum e_model {
  static const size_t kB = 1024;
  static const size_t MB = kB*kB;
+ static const size_t GB = kB*kB*kB;

- // default hparams (LLaMA 7B)
  struct llama_hparams {
- uint32_t n_vocab = 32000;
- uint32_t n_ctx_train = 2048; // the context size used during training
- uint32_t n_ctx = 512; // the context size used during inference
- uint32_t n_embd = 4096;
- uint32_t n_head = 32;
- uint32_t n_head_kv = 32;
- uint32_t n_layer = 32;
- uint32_t n_rot = 64;
- uint32_t n_ff = 11008;
-
- float f_norm_eps = 1e-5;
- float f_norm_rms_eps = 1e-5;
-
- float rope_freq_base = 10000.0f;
- float rope_freq_scale = 1.0f;
+ uint32_t n_vocab;
+ uint32_t n_ctx_train; // context size the model was trained on
+ uint32_t n_ctx; // context size used during inference
+ uint32_t n_embd;
+ uint32_t n_head;
+ uint32_t n_head_kv;
+ uint32_t n_layer;
+ uint32_t n_rot;
+ uint32_t n_ff;
+
+ float f_norm_eps;
+ float f_norm_rms_eps;
+
+ float rope_freq_base;
+ float rope_freq_scale;

  bool operator!=(const llama_hparams & other) const {
  return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -960,13 +981,22 @@ struct llama_layer {
  struct ggml_tensor * wo;
  struct ggml_tensor * wqkv;

+ // attention bias
+ struct ggml_tensor * bo;
+ struct ggml_tensor * bqkv;
+
  // normalization
  struct ggml_tensor * ffn_norm;
+ struct ggml_tensor * ffn_norm_b;

  // ff
  struct ggml_tensor * w1; // ffn_gate
  struct ggml_tensor * w2; // ffn_down
  struct ggml_tensor * w3; // ffn_up
+
+ // ff bias
+ struct ggml_tensor * b2; // ffn_down
+ struct ggml_tensor * b3; // ffn_up
  };

  struct llama_kv_cache {
@@ -1040,10 +1070,11 @@ struct llama_model {
  std::string name = "n/a";

- llama_hparams hparams;
+ llama_hparams hparams = {};
  llama_vocab vocab;

  struct ggml_tensor * tok_embeddings;
+ struct ggml_tensor * pos_embeddings;

  struct ggml_tensor * output_norm;
  struct ggml_tensor * output_norm_b;
@@ -1244,6 +1275,7 @@ struct llama_model_loader {
  int n_created = 0;

  int64_t n_elements = 0;
+ size_t n_bytes = 0;

  bool use_mmap = false;
@@ -1276,6 +1308,7 @@ struct llama_model_loader {
  const char * name = gguf_get_tensor_name(ctx_gguf, i);
  struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
  n_elements += ggml_nelements(t);
+ n_bytes += ggml_nbytes(t);
  }

  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1554,7 +1587,7 @@ struct llama_model_loader {
  // load LLaMA models
  //

- std::string llama_model_ftype_name(enum llama_ftype ftype) {
+ static std::string llama_model_ftype_name(enum llama_ftype ftype) {
  if (ftype & LLAMA_FTYPE_GUESSED) {
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
  }
@@ -1587,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
  static const char * llama_model_type_name(e_model type) {
  switch (type) {
+ case MODEL_1B: return "1B";
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_13B: return "13B";
+ case MODEL_15B: return "15B";
  case MODEL_30B: return "30B";
  case MODEL_34B: return "34B";
  case MODEL_40B: return "40B";
@@ -1633,28 +1668,17 @@ static void llm_load_hparams(
  hparams.n_head_kv = hparams.n_head;
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

- // TODO: manually setting rope freq base and scale should override this
- // FIXME: partial fix when the param specified is not the default value, but
- // will not work for overriding the model value to the params default
-
- llama_context_params defaults = llama_context_default_params();
-
- // rope_freq_base
- {
- float ropebase = 10000.0f;
- GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
- if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
- rope_freq_base = ropebase;
- }
+ // rope_freq_base (optional)
+ if (rope_freq_base == 0.0f) {
+ rope_freq_base = 10000.0f;
+ GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
  }

  // rope_freq_scale (inverse of the kv) is optional
- {
+ if (rope_freq_scale == 0.0f) {
  float ropescale = 1.0f;
  GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
- if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
- rope_freq_scale = 1.0f/ropescale;
- }
+ rope_freq_scale = 1.0f/ropescale;
  }

  // sanity check for n_rot (optional)
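Note: together with the new 0.0f defaults in llama_context_default_params() (see the hunk near the end of this diff), a rope_freq_base or rope_freq_scale of 0.0f now means "not specified by the caller": the loader falls back to the value stored in the GGUF metadata, or to 10000.0f / 1.0f if the key is absent, while any non-zero value passed in the context params wins. A minimal sketch of that resolution order (the helper name is hypothetical, not part of llama.cpp):

    // Sketch only: resolve one rope parameter the way the new hunk does.
    static float resolve_rope_param(float requested, float gguf_value, float fallback) {
        if (requested != 0.0f) {
            return requested;                               // explicit caller override
        }
        return gguf_value != 0.0f ? gguf_value : fallback;  // model metadata, then default
    }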
@@ -1707,6 +1731,17 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STARCODER:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 36: model.type = e_model::MODEL_3B; break;
+ case 42: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_15B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  };
@@ -1860,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
- LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+ if (ml.n_bytes < GB) {
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ } else {
+ LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ }

  // general kv
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
@@ -2160,6 +2200,85 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_STARCODER:
+ {
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend backend_norm;
+ ggml_backend backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ vram_weights += ggml_nbytes(model.output_norm_b);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
+ }
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  };
@@ -3299,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
  return gf;
  }

+ static struct ggml_cgraph * llm_build_starcoder(
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ int n_tokens,
+ int n_past) {
+
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+ const int N = n_tokens;
+
+ const auto & model = lctx.model;
+ const auto & hparams = model.hparams;
+
+ const auto & kv_self = lctx.kv_self;
+
+ GGML_ASSERT(!!kv_self.ctx);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_ctx = hparams.n_ctx;
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head = hparams.n_embd_head();
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ const float norm_eps = hparams.f_norm_eps;
+
+ auto & buf_compute = lctx.buf_compute;
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ buf_compute.size,
+ /*.mem_buffer =*/ buf_compute.data,
+ /*.no_alloc =*/ false,
+ };
+
+ params.no_alloc = true;
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * token;
+ struct ggml_tensor * position;
+ struct ggml_tensor * inpL;
+
+ if (tokens) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ }
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+ } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+ ggml_allocr_alloc(lctx.alloc, token);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
+ }
+ }
+
+ {
+ // Compute position embeddings.
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ for (int i = 0; i < N; ++i) {
+ ((int32_t *) inp_positions->data)[i] = n_past + i;
+ }
+ }
+ ggml_set_name(inp_positions, "inp_positions");
+
+ position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+ }
+
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ }
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+ inpL = ggml_add(ctx0, token, position);
+ ggml_set_name(inpL, "inpL");
+
+ for (int il = 0; il < n_layer; ++il) {
+ {
+ // Norm
+ cur = ggml_norm(ctx0, inpL, norm_eps);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+ }
+
+ {
+ // Self Attention
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+ struct ggml_tensor * Qcur = tmpq;
+ struct ggml_tensor * Kcur = tmpk;
+
+ {
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+ ggml_set_name(Vcur, "Vcur");
+
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+ ggml_set_name(k, "k");
+
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ }
+
+ struct ggml_tensor * Q =
+ ggml_permute(ctx0,
+ ggml_cpy(ctx0,
+ Qcur,
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
+ 0, 2, 1, 3);
+ ggml_set_name(Q, "Q");
+
+ struct ggml_tensor * K =
+ ggml_view_3d(ctx0, kv_self.k,
+ n_embd_head, n_past + N, n_head_kv,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+ ggml_set_name(K, "K");
+
+ // K * Q
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ ggml_set_name(KQ, "KQ");
+
+ // KQ_scaled = KQ / sqrt(n_embd_head)
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ ggml_set_name(KQ_scaled, "KQ_scaled");
+
+ // KQ_masked = mask_past(KQ_scaled)
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ ggml_set_name(KQ_masked, "KQ_masked");
+
+ // KQ = soft_max(KQ_masked)
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+ // split cached V into n_head heads
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, kv_self.v,
+ n_past + N, n_embd_head, n_head_kv,
+ ggml_element_size(kv_self.v)*n_ctx,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+ ggml_set_name(V, "V");
+
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ ggml_set_name(KQV, "KQV");
+
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ ggml_set_name(KQV_merged, "KQV_merged");
+
+ // cur = KQV_merged.contiguous().view(n_embd, N)
+ cur = ggml_cpy(ctx0,
+ KQV_merged,
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ ggml_set_name(cur, "KQV_merged_contiguous");
+ }
+
+ // Projection
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+ // Add the input
+ cur = ggml_add(ctx0, cur, inpL);
+
+ struct ggml_tensor * inpFF = cur;
+
+ // FF
+ {
+ // Norm
+ {
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+ }
+
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+ // GELU activation
+ cur = ggml_gelu(ctx0, cur);
+
+ // Projection
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+ }
+
+ inpL = ggml_add(ctx0, cur, inpFF);
+ }
+
+ // Output Norm
+ {
+ cur = ggml_norm(ctx0, inpL, norm_eps);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+ }
+ ggml_set_name(cur, "result_norm");
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ ggml_set_name(cur, "result_output");
+
+ ggml_build_forward_expand(gf, cur);
+ ggml_free(ctx0);
+
+ return gf;
+ }
+
  static struct ggml_cgraph * llama_build_graph(
  llama_context & lctx,
  const llama_token * tokens,
@@ -3322,6 +3670,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
  } break;
+ case LLM_ARCH_STARCODER:
+ {
+ result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
+ } break;
  default:
  GGML_ASSERT(false);
  };
@@ -3408,6 +3760,15 @@ static bool llama_eval_internal(
  n_threads = std::min(4, n_threads);
  }

+ // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+ const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+ model.arch == LLM_ARCH_BAICHUAN ||
+ model.arch == LLM_ARCH_FALCON;
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+ if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+ n_threads = 1;
+ }
+
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3423,10 +3784,6 @@ static bool llama_eval_internal(
  if (lctx.ctx_metal) {
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
- ggml_metal_get_tensor (lctx.ctx_metal, res);
- if (!lctx.embedding.empty()) {
- ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
- }
  } else {
  ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
  }
@@ -3939,7 +4296,7 @@ struct llama_grammar_candidate {
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
- std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  const char * src,
  llama_partial_utf8 partial_start) {
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -5537,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }

  // TODO: after the GGUF PR, this likely won't work and needs to be updated
- int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+ static int llama_apply_lora_from_file_internal(
+ const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+ ) {
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

  const int64_t t_start_lora_us = ggml_time_us();
@@ -5821,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
  /*.n_gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
- /*.rope_freq_base =*/ 10000.0f,
- /*.rope_freq_scale =*/ 1.0f,
+ /*.rope_freq_base =*/ 0.0f,
+ /*.rope_freq_scale =*/ 0.0f,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
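Note: this is the caller-facing half of the rope change in llm_load_hparams above: the defaults are now the 0.0f "not set" sentinel, so the rope values stored in the GGUF file are used unless the caller overrides them. A minimal usage sketch (the override values are illustrative only):

    struct llama_context_params params = llama_context_default_params();
    // 0.0f (the new default) means "take rope_freq_base/scale from the model file".
    // Setting a non-zero value overrides whatever the GGUF metadata specifies, e.g.:
    params.rope_freq_base  = 1000000.0f;  // hypothetical override for a long-context run
    params.rope_freq_scale = 0.5f;        // hypothetical linear rope scaling factor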
@@ -6084,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
  return ctx;
  }

- struct llama_context * llama_init_from_file(
+ static struct llama_context * llama_init_from_file(
  const char * path_model,
  struct llama_context_params params) {
  struct llama_model * model = llama_load_model_from_file(path_model, params);
@@ -6289,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
  * llama_copy_state_data(ctx, &data_ctx);
  *
  */
- void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
  // copy rng
  {
  std::stringstream rng_ss;
@@ -6673,19 +7032,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
  int llama_tokenize(
  struct llama_context * ctx,
  const char * text,
+ int text_len,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+ return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
  }

  int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
+ int text_len,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);

  if (n_max_tokens < (int) res.size()) {
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
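Note: llama_tokenize and llama_tokenize_with_model now take an explicit byte length instead of relying on NUL termination, which also allows input containing embedded NUL bytes. A minimal calling sketch (ctx is assumed to come from llama_new_context_with_model; the buffer size is arbitrary, and the negative-return convention for an undersized buffer is based on the surrounding llama.cpp code rather than this hunk):

    const std::string text = "def fib(n):";
    std::vector<llama_token> tokens(64);
    int n = llama_tokenize(ctx, text.c_str(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), /*add_bos=*/ true);
    if (n >= 0) {
        tokens.resize(n);   // n tokens were written
    } else {
        tokens.resize(-n);  // buffer was too small; magnitude is the required count, retry
    }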
@@ -6827,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
  }

  // For internal test use
- const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+ struct llama_context * ctx
+ ) {
  return ctx->model.tensors_by_name;
  }