llama_cpp 0.14.6 → 0.14.7

@@ -211,6 +211,7 @@ enum llm_arch {
  LLM_ARCH_QWEN2,
  LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_QWEN2, "qwen2" },
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
@@ -793,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_PLAMO,
  {
@@ -1600,12 +1619,12 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
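
Note: the new `special` flag lets control tokens such as `<|eot_id|>` or `<|end|>` be rendered as their text instead of being silently dropped. A minimal sketch of detokenizing one token through the updated public C function, mirroring the internal helper above (the resize-and-retry follows the negative-return contract the API already uses):

    // assumes a loaded llama_model * model; returns the token's text
    std::string token_to_text(const llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
        if (n < 0) {            // buffer too small: -n is the required size
            buf.resize(-n);
            n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
        }
        return std::string(buf.data(), n);
    }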
@@ -2120,7 +2139,7 @@ struct llama_vocab {
  id special_prefix_id = -1;
  id special_suffix_id = -1;
  id special_middle_id = -1;
- id special_eot_id = -1;
+ id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

  bool add_space_prefix = true;

@@ -2980,9 +2999,13 @@ struct llama_model_loader {

  ggml_tensor * tensor;

- llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+ }
  }
  };
  std::vector<llama_tensor_weight> weights;
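
Note: the first half of the new check, `offs + ggml_nbytes(tensor) < offs`, guards against unsigned wrap-around: if adding the tensor size overflows, the sum comes out smaller than the original offset. A standalone sketch of the same idiom (names are illustrative, not part of the patch):

    #include <cstdint>

    // true when [offs, offs + nbytes) fits inside a file of file_size bytes,
    // rejecting ranges whose end wrapped past UINT64_MAX
    bool range_in_file(uint64_t offs, uint64_t nbytes, uint64_t file_size) {
        const uint64_t end = offs + nbytes;
        if (end < offs) return false;  // overflow
        return end <= file_size;
    }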
@@ -3021,15 +3044,15 @@ struct llama_model_loader {
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset of the main file.
  // For subsidiary files, `meta` tensor data offset must not be used,
  // so we build a unified tensors index for weights.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(0, cur->name, meta, cur);
+ weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
  }
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
- contexts.emplace_back(ctx);
-
  uint16_t n_split = 0;
  get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -3063,12 +3086,13 @@ struct llama_model_loader {
  throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
  }

+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset info of the shard.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
  }
- files.emplace_back(new llama_file(split_path, "rb"));
- contexts.emplace_back(ctx);

  gguf_free(ctx_gguf);
  }
@@ -3278,6 +3302,10 @@ struct llama_model_loader {
  return nullptr;
  }

+ const llama_tensor_weight * get_weight(int i) const {
+ return get_weight(get_tensor_name(i));
+ }
+
  const llama_tensor_weight & require_weight(const char * name) const {
  const llama_tensor_weight * weight = get_weight(name);
  if (!weight) {
@@ -3770,7 +3798,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
+ case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3955,6 +3983,16 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
@@ -4179,7 +4217,10 @@ static void llm_load_vocab(
  vocab.special_prefix_id = 67;
  vocab.special_suffix_id = 69;
  vocab.special_middle_id = 68;
- vocab.special_eot_id = 70;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
  }
  }

@@ -4308,6 +4349,7 @@ static void llm_load_vocab(
  { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
  { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  };
+
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
  int32_t & id = std::get<1>(it);
@@ -4322,7 +4364,6 @@ static void llm_load_vocab(
  } else {
  id = new_id;
  }
-
  }

  // Handle add_bos_token and add_eos_token
@@ -4336,6 +4377,28 @@ static void llm_load_vocab(
  vocab.special_add_eos = int(temp);
  }
  }
+
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+ // for now, we apply this workaround to find the EOT token based on its text
+ if (vocab.special_eot_id == -1) {
+ for (const auto & t : vocab.token_to_id) {
+ if (
+ // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+ // need to fix convert script
+ //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+ (t.first == "<|eot_id|>" ||
+ t.first == "<|im_end|>" ||
+ t.first == "<|end|>" ||
+ t.first == "<end_of_turn>"
+ )
+ ) {
+ vocab.special_eot_id = t.second;
+ break;
+ }
+ }
+ }
  }

  // build special tokens cache
@@ -4498,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+ if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+ if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
  }

  // Returns false if cancelled by progress_callback
@@ -5346,6 +5414,33 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
+ case LLM_ARCH_PHI3:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context* ctx_layer = ctx_for_layer(i);
+ ggml_context* ctx_split = ctx_for_layer_split(i);
+
+ auto& layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+ }
+ } break;
  case LLM_ARCH_PLAMO:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6297,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- if (model.arch == LLM_ARCH_PHI2) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -8938,12 +9033,140 @@ struct llm_build_context {

  cur = ggml_add(ctx0, cur, model.output_b);
  cb(cur, "result_output", -1);
+ ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ struct ggml_cgraph * build_phi3() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ auto residual = inpL;
+
+ // self-attention
+ {
+ struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(attn_norm_output, "attn_norm", il);
+
+ struct ggml_tensor * Qcur = nullptr;
+ struct ggml_tensor * Kcur = nullptr;
+ struct ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv) {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+ cb(cur, "wqkv", il);
+
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+ }
+ else {
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_custom(
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ }
+
+ cur = ggml_add(ctx0, cur, residual);
+ residual = cur;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // FF
+ // special-case: the up and gate tensors are merged into a single tensor
+ // TOOD: support into llm_build_ffn
+ {
+ struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+ cb(up, "ffn_up", il);
+
+ auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
+ auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
+
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
+ cb(y, "ffn_gate", il);
+
+ auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
+ cb(down, "ffn_down", il);
+
+ cur = down;
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, residual, cur);
+ cb(cur, "l_out", il);
+
+ inpL = cur;
+ }
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);

  ggml_build_forward_expand(gf, cur);

  return gf;
  }

+
  struct ggml_cgraph * build_plamo() {
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);

@@ -10445,6 +10668,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_phi2();
  } break;
+ case LLM_ARCH_PHI3:
+ {
+ result = llm.build_phi3();
+ } break;
  case LLM_ARCH_PLAMO:
  {
  result = llm.build_plamo();
@@ -13268,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  GGML_ASSERT(ctx);
  const int64_t t_start_sample_us = ggml_time_us();

- bool allow_eos = false;
+ bool allow_eog = false;
  for (const auto & stack : grammar->stacks) {
  if (stack.empty()) {
- allow_eos = true;
+ allow_eog = true;
  break;
  }
  }

- const llama_token eos = llama_token_eos(&ctx->model);
-
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
  candidates_decoded.reserve(candidates->size);
  std::vector<llama_grammar_candidate> candidates_grammar;
@@ -13285,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

  for (size_t i = 0; i < candidates->size; ++i) {
  const llama_token id = candidates->data[i].id;
- const std::string piece = llama_token_to_piece(ctx, id);
- if (id == eos) {
- if (!allow_eos) {
+ const std::string piece = llama_token_to_piece(ctx, id, false);
+
+ if (llama_token_is_eog(&ctx->model, id)) {
+ if (!allow_eog) {
  candidates->data[i].logit = -INFINITY;
  }
  } else if (piece.empty() || piece[0] == 0) {
@@ -13450,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
  return result;
  }

- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+ llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
  GGML_ASSERT(ctx);

  const int64_t t_start_sample_us = ggml_time_us();
@@ -13463,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  }

  std::discrete_distribution<> dist(probs.begin(), probs.end());
- auto & rng = ctx->rng;
  int idx = dist(rng);

  llama_token result = candidates->data[idx].id;
@@ -13473,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  return result;
  }

+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+ return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+ }
+
  void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
  const int64_t t_start_sample_us = ggml_time_us();

- if (token == llama_token_eos(&ctx->model)) {
+ if (llama_token_is_eog(&ctx->model, token)) {
  for (const auto & stack : grammar->stacks) {
  if (stack.empty()) {
  return;
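
Note: the sampling core now threads the RNG explicitly, and `llama_sample_token` becomes a thin wrapper over `ctx->rng`. Callers that keep their own generator (for example one per server slot) can sample reproducibly; a hedged sketch of the pattern, assuming the `_with_rng` symbol is visible from your translation unit:

    #include <random>

    // candidates: a llama_token_data_array already run through the usual samplers
    // (top-k, top-p, temperature, ...); seed controls reproducibility
    llama_token sample_reproducibly(llama_context * ctx, llama_token_data_array * candidates, uint32_t seed) {
        std::mt19937 rng(seed);                                   // caller-owned RNG (illustrative setup)
        return llama_sample_token_with_rng(ctx, candidates, rng); // same math as llama_sample_token
    }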
@@ -13485,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }

- const std::string piece = llama_token_to_piece(ctx, token);
+ const std::string piece = llama_token_to_piece(ctx, token, false);

  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14308,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<no_init<uint8_t>> work;
  std::vector<no_init<float>> f32_conv_buf;

+ uint16_t n_split = 1;
+ // Assume split index is continuous
+ if (params->keep_split) {
+ for (int i = 0; i < ml.n_tensors; ++i) {
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+ }
+ }
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
+ ctx_outs[0] = ctx_out;
+
  // populate the original tensors so we get an initial meta data
  for (int i = 0; i < ml.n_tensors; ++i) {
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
- gguf_add_tensor(ctx_out, meta);
+ auto weight = ml.get_weight(i);
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
+ struct ggml_tensor * tensor = weight->tensor;
+ if (ctx_outs[i_split] == NULL) {
+ ctx_outs[i_split] = gguf_init_empty();
+ }
+ gguf_add_tensor(ctx_outs[i_split], tensor);
  }

- std::ofstream fout(fname_out, std::ios::binary);
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
- const size_t meta_size = gguf_get_meta_size(ctx_out);
+ // Set split info if needed
+ if (n_split > 1) {
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+ }
+ }

- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+ int cur_split = -1;
+ std::ofstream fout;
+ auto close_ofstream = [&]() {
+ // Write metadata and close file handler
+ if (fout.is_open()) {
+ fout.seekp(0);
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
+ fout.write((const char *) data.data(), data.size());
+ fout.close();
+ }
+ };
+ auto new_ofstream = [&](int index) {
+ cur_split = index;
+ GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+ std::string fname = fname_out;
+ if (params->keep_split) {
+ char split_path[PATH_MAX] = {0};
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+ fname = std::string(split_path);
+ }

- // placeholder for the meta data
- ::zeros(fout, meta_size);
+ fout = std::ofstream(fname, std::ios::binary);
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+ // placeholder for the meta data
+ ::zeros(fout, meta_size);
+ };

  const auto tn = LLM_TN(model.arch);
-
+ new_ofstream(0);
  for (int i = 0; i < ml.n_tensors; ++i) {
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+ auto weight = ml.get_weight(i);
+ struct ggml_tensor * tensor = weight->tensor;
+ if (weight->idx != cur_split && params->keep_split) {
+ close_ofstream();
+ new_ofstream(weight->idx);
+ }

  const std::string name = ggml_get_name(tensor);

@@ -14482,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  total_size_new += new_size;

  // update the gguf meta data as we go
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);

  // write tensor data + padding
  fout.write((const char *) new_data, new_size);
  zeros(fout, GGML_PAD(new_size, align) - new_size);
  }
-
- // go back to beginning of file and write the updated meta data
- {
- fout.seekp(0);
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
- gguf_get_meta_data(ctx_out, data.data());
- fout.write((const char *) data.data(), data.size());
+ close_ofstream();
+ for (auto & c:ctx_outs) {
+ gguf_free(c);
  }

- fout.close();
-
- gguf_free(ctx_out);
-
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

@@ -14857,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  /*.quantize_output_tensor =*/ true,
  /*.only_copy =*/ false,
  /*.pure =*/ false,
+ /*.keep_split =*/ false,
  /*.imatrix =*/ nullptr,
  /*.kv_overrides =*/ nullptr,
  };
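
Note: with `keep_split` enabled the quantizer writes one output shard per input shard instead of merging everything into a single GGUF, deriving each shard name via `llama_split_path`. A minimal sketch of driving this through the C API (file names are placeholders, error handling omitted):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // target quantization type
    qparams.keep_split = true;                       // mirror the input's split layout
    // pass any shard of the split input; output shard suffixes are generated per split
    if (llama_model_quantize("input-00001-of-00003.gguf", "output-q4_k_m.gguf", &qparams) != 0) {
        // quantization failed
    }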
@@ -15365,6 +15635,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_QWEN2:
  case LLM_ARCH_QWEN2MOE:
  case LLM_ARCH_PHI2:
+ case LLM_ARCH_PHI3:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_STARCODER2:
  return LLAMA_ROPE_TYPE_NEOX;
@@ -15378,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  return LLAMA_ROPE_TYPE_NONE;
  }

+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+ return ctx->cparams.pooling_type;
+ }
+
  int32_t llama_n_vocab(const struct llama_model * model) {
  return model->hparams.n_vocab;
  }
@@ -15856,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
  *
  */
  static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ llama_synchronize(ctx);
+
  // copy rng
  {
  std::ostringstream rng_ss;
@@ -16008,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {

  // Sets the state reading from the specified source address
  size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
+ llama_synchronize(ctx);
+
  const uint8_t * inp = src;

  // set rng
@@ -16312,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
  }

  static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+ llama_synchronize(ctx);
+
  const auto & kv_self = ctx->kv_self;
  GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16429,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
  }

  size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+ llama_synchronize(ctx);
+
  auto & kv_self = ctx->kv_self;
  GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16880,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
  return model->vocab.id_to_token[token].type;
  }

+ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+ return token != -1 && (
+ token == llama_token_eos(model) ||
+ token == llama_token_eot(model)
+ );
+ }
+
  llama_token llama_token_bos(const struct llama_model * model) {
  return model->vocab.special_bos_id;
  }
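
Note: downstream decode loops should now stop on any end-of-generation token (EOS or EOT) rather than comparing against EOS alone. A hedged sketch of the typical check after sampling a token (batching and sampling elided; `model` and `new_token` are assumed to exist in the surrounding loop):

    if (llama_token_is_eog(model, new_token)) {
        // the model signalled end of generation (EOS, <|eot_id|>, <|im_end|>, ...)
        break;
    }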
@@ -16957,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
  }

  // does not write null-terminator to buf
- int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
  if (0 <= token && token < llama_n_vocab(model)) {
  switch (llama_vocab_get_type(model->vocab)) {
  case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
  }
  memcpy(buf, result.c_str(), result.length());
  return result.length();
- } else if (llama_is_user_defined_token(model->vocab, token)) {
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
  std::string result = model->vocab.id_to_token[token].text;
  if (length < (int) result.length()) {
  return -(int) result.length();
@@ -16985,8 +17277,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
  }
  memcpy(buf, "\xe2\x96\x85", 3);
  return 3;
- } else if (llama_is_control_token(model->vocab, token)) {
- ;
  } else if (llama_is_byte_token(model->vocab, token)) {
  if (length < 1) {
  return -1;
@@ -17007,15 +17297,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
  }
  memcpy(buf, result.c_str(), result.length());
  return result.length();
- } else if (llama_is_user_defined_token(model->vocab, token)) {
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
  std::string result = model->vocab.id_to_token[token].text;
  if (length < (int) result.length()) {
  return -(int) result.length();
  }
  memcpy(buf, result.c_str(), result.length());
  return result.length();
- } else if (llama_is_control_token(model->vocab, token)) {
- ;
  }
  break;
  }
@@ -17213,6 +17503,24 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
  }
+ } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+ // Llama 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+ }
+ if (add_ass) {
+ ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+ }
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+ // Phi 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
  } else {
  // template not supported
  return -1;
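
Note: the built-in templates now cover the Llama 3 and Phi 3 chat formats, selectable either from the model's tokenizer metadata or by name. A hedged sketch of formatting a short conversation via the public API (buffer sizing simplified; a return value larger than the buffer would require retrying with a bigger buffer):

    std::vector<llama_chat_message> msgs = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(4096);
    // naming the template ("phi3" or "llama3") bypasses the model's metadata lookup
    int32_t n = llama_chat_apply_template(nullptr, "phi3", msgs.data(), msgs.size(),
                                          /*add_ass=*/true, buf.data(), buf.size());
    if (n > 0 && n <= (int32_t) buf.size()) {
        std::string prompt(buf.data(), n);  // "<|system|>\n...<|end|>\n<|user|>\n...<|end|>\n<|assistant|>\n"
    }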
@@ -17345,6 +17653,11 @@ const char * llama_print_system_info(void) {
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+ #ifdef GGML_USE_LLAMAFILE
+ s += "LAMMAFILE = 1 | ";
+ #else
+ s += "LAMMAFILE = 0 | ";
+ #endif

  return s.c_str();
  }