llama_cpp 0.14.6 → 0.14.7

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -211,6 +211,7 @@ enum llm_arch {
  LLM_ARCH_QWEN2,
  LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_QWEN2, "qwen2" },
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
@@ -793,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_PLAMO,
  {
@@ -1600,12 +1619,12 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
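
The `special` flag added to `llama_token_to_piece` controls whether control tokens (for example `<|eot_id|>`) are rendered as their text instead of being dropped. A minimal caller-side sketch of the same resize-on-negative pattern, assuming a loaded `llama_context * ctx`:

#include <string>
#include <vector>
#include "llama.h"

// Detokenize a single token; special = true also renders control tokens as text.
// Sketch only: error handling beyond the resize retry is omitted.
static std::string token_to_text(llama_context * ctx, llama_token token, bool special) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), (int32_t) buf.size(), special);
    if (n < 0) {
        buf.resize(-n);  // a negative return value is the required buffer size
        n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), (int32_t) buf.size(), special);
    }
    return std::string(buf.data(), n);
}
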
@@ -2120,7 +2139,7 @@ struct llama_vocab {
  id special_prefix_id = -1;
  id special_suffix_id = -1;
  id special_middle_id = -1;
- id special_eot_id = -1;
+ id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

  bool add_space_prefix = true;

@@ -2980,9 +2999,13 @@ struct llama_model_loader {

  ggml_tensor * tensor;

- llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+ }
  }
  };
  std::vector<llama_tensor_weight> weights;
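
The added bounds check guards against both a tensor that extends past the end of the file and unsigned wrap-around: if `offs + ggml_nbytes(tensor)` overflows, the sum becomes smaller than `offs`. The same idiom in isolation, as a hedged standalone sketch:

#include <cstdint>

// True when the byte range [offs, offs + nbytes) lies entirely inside a file of file_size bytes.
// The first comparison catches unsigned overflow of offs + nbytes.
static bool range_within_file(uint64_t offs, uint64_t nbytes, uint64_t file_size) {
    return offs + nbytes >= offs && offs + nbytes <= file_size;
}
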
@@ -3021,15 +3044,15 @@ struct llama_model_loader {
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset of the main file.
  // For subsidiary files, `meta` tensor data offset must not be used,
  // so we build a unified tensors index for weights.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(0, cur->name, meta, cur);
+ weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
  }
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
- contexts.emplace_back(ctx);
-
  uint16_t n_split = 0;
  get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -3063,12 +3086,13 @@ struct llama_model_loader {
  throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
  }

+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset info of the shard.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
  }
- files.emplace_back(new llama_file(split_path, "rb"));
- contexts.emplace_back(ctx);

  gguf_free(ctx_gguf);
  }
@@ -3278,6 +3302,10 @@ struct llama_model_loader {
  return nullptr;
  }

+ const llama_tensor_weight * get_weight(int i) const {
+ return get_weight(get_tensor_name(i));
+ }
+
  const llama_tensor_weight & require_weight(const char * name) const {
  const llama_tensor_weight * weight = get_weight(name);
  if (!weight) {
@@ -3770,7 +3798,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
+ case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3955,6 +3983,16 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
@@ -4179,7 +4217,10 @@ static void llm_load_vocab(
  vocab.special_prefix_id = 67;
  vocab.special_suffix_id = 69;
  vocab.special_middle_id = 68;
- vocab.special_eot_id = 70;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
  }
  }

@@ -4308,6 +4349,7 @@ static void llm_load_vocab(
  { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
  { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  };
+
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
  int32_t & id = std::get<1>(it);
@@ -4322,7 +4364,6 @@ static void llm_load_vocab(
  } else {
  id = new_id;
  }
-
  }

  // Handle add_bos_token and add_eos_token
@@ -4336,6 +4377,28 @@ static void llm_load_vocab(
  vocab.special_add_eos = int(temp);
  }
  }
+
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+ // for now, we apply this workaround to find the EOT token based on its text
+ if (vocab.special_eot_id == -1) {
+ for (const auto & t : vocab.token_to_id) {
+ if (
+ // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+ // need to fix convert script
+ //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+ (t.first == "<|eot_id|>" ||
+ t.first == "<|im_end|>" ||
+ t.first == "<|end|>" ||
+ t.first == "<end_of_turn>"
+ )
+ ) {
+ vocab.special_eot_id = t.second;
+ break;
+ }
+ }
+ }
  }

  // build special tokens cache
@@ -4498,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+ if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+ if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
  }

  // Returns false if cancelled by progress_callback
@@ -5346,6 +5414,33 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
+ case LLM_ARCH_PHI3:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context* ctx_layer = ctx_for_layer(i);
+ ggml_context* ctx_split = ctx_for_layer_split(i);
+
+ auto& layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+ }
+ } break;
  case LLM_ARCH_PLAMO:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6297,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- if (model.arch == LLM_ARCH_PHI2) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -8938,12 +9033,140 @@ struct llm_build_context {

  cur = ggml_add(ctx0, cur, model.output_b);
  cb(cur, "result_output", -1);
+ ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ struct ggml_cgraph * build_phi3() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ auto residual = inpL;
+
+ // self-attention
+ {
+ struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(attn_norm_output, "attn_norm", il);
+
+ struct ggml_tensor * Qcur = nullptr;
+ struct ggml_tensor * Kcur = nullptr;
+ struct ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv) {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+ cb(cur, "wqkv", il);
+
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+ }
+ else {
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_custom(
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ }
+
+ cur = ggml_add(ctx0, cur, residual);
+ residual = cur;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // FF
+ // special-case: the up and gate tensors are merged into a single tensor
+ // TOOD: support into llm_build_ffn
+ {
+ struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+ cb(up, "ffn_up", il);
+
+ auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
+ auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
+
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
+ cb(y, "ffn_gate", il);
+
+ auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
+ cb(down, "ffn_down", il);
+
+ cur = down;
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, residual, cur);
+ cb(cur, "l_out", il);
+
+ inpL = cur;
+ }
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);

  ggml_build_forward_expand(gf, cur);

  return gf;
  }

+
  struct ggml_cgraph * build_plamo() {
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);

@@ -10445,6 +10668,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_phi2();
  } break;
+ case LLM_ARCH_PHI3:
+ {
+ result = llm.build_phi3();
+ } break;
  case LLM_ARCH_PLAMO:
  {
  result = llm.build_plamo();
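
The FFN block in `build_phi3` above handles the fused `ffn_up` tensor by viewing its first half as the gate and its second half as the up projection, then applying SwiGLU. A scalar sketch of that computation, assuming one row laid out as [gate | up]:

#include <cmath>
#include <vector>

// SwiGLU over a fused [gate | up] row, mirroring the ggml view / silu / mul sequence
// in build_phi3. Sketch only; the real graph operates on 2D ggml tensors with strides.
static std::vector<float> swiglu_fused_row(const std::vector<float> & gate_up) {
    const size_t n_ff = gate_up.size() / 2;
    std::vector<float> out(n_ff);
    for (size_t i = 0; i < n_ff; ++i) {
        const float g = gate_up[i];          // first half: gate
        const float u = gate_up[n_ff + i];   // second half: up
        out[i] = u * (g / (1.0f + std::exp(-g)));  // u * silu(g)
    }
    return out;
}
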
@@ -13268,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  GGML_ASSERT(ctx);
  const int64_t t_start_sample_us = ggml_time_us();

- bool allow_eos = false;
+ bool allow_eog = false;
  for (const auto & stack : grammar->stacks) {
  if (stack.empty()) {
- allow_eos = true;
+ allow_eog = true;
  break;
  }
  }

- const llama_token eos = llama_token_eos(&ctx->model);
-
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
  candidates_decoded.reserve(candidates->size);
  std::vector<llama_grammar_candidate> candidates_grammar;
@@ -13285,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

  for (size_t i = 0; i < candidates->size; ++i) {
  const llama_token id = candidates->data[i].id;
- const std::string piece = llama_token_to_piece(ctx, id);
- if (id == eos) {
- if (!allow_eos) {
+ const std::string piece = llama_token_to_piece(ctx, id, false);
+
+ if (llama_token_is_eog(&ctx->model, id)) {
+ if (!allow_eog) {
  candidates->data[i].logit = -INFINITY;
  }
  } else if (piece.empty() || piece[0] == 0) {
@@ -13450,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
  return result;
  }

- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+ llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
  GGML_ASSERT(ctx);

  const int64_t t_start_sample_us = ggml_time_us();
@@ -13463,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  }

  std::discrete_distribution<> dist(probs.begin(), probs.end());
- auto & rng = ctx->rng;
  int idx = dist(rng);

  llama_token result = candidates->data[idx].id;
@@ -13473,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  return result;
  }

+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+ return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+ }
+
  void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
  const int64_t t_start_sample_us = ggml_time_us();

- if (token == llama_token_eos(&ctx->model)) {
+ if (llama_token_is_eog(&ctx->model, token)) {
  for (const auto & stack : grammar->stacks) {
  if (stack.empty()) {
  return;
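
`llama_sample_token` now delegates to `llama_sample_token_with_rng`, so callers can supply their own RNG for reproducible sampling. A minimal sketch, assuming the declaration of `llama_sample_token_with_rng` is visible to the caller and `candidates` was built from the latest logits:

#include <random>
#include "llama.h"

// Sample with a caller-owned RNG: a fixed seed gives a repeatable token stream
// for identical logits, independent of the context's internal RNG state.
static llama_token sample_with_seed(llama_context * ctx, llama_token_data_array * candidates, uint32_t seed) {
    std::mt19937 rng(seed);
    return llama_sample_token_with_rng(ctx, candidates, rng);
}
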
@@ -13485,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }

- const std::string piece = llama_token_to_piece(ctx, token);
+ const std::string piece = llama_token_to_piece(ctx, token, false);

  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14308,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<no_init<uint8_t>> work;
  std::vector<no_init<float>> f32_conv_buf;

+ uint16_t n_split = 1;
+ // Assume split index is continuous
+ if (params->keep_split) {
+ for (int i = 0; i < ml.n_tensors; ++i) {
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+ }
+ }
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
+ ctx_outs[0] = ctx_out;
+
  // populate the original tensors so we get an initial meta data
  for (int i = 0; i < ml.n_tensors; ++i) {
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
- gguf_add_tensor(ctx_out, meta);
+ auto weight = ml.get_weight(i);
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
+ struct ggml_tensor * tensor = weight->tensor;
+ if (ctx_outs[i_split] == NULL) {
+ ctx_outs[i_split] = gguf_init_empty();
+ }
+ gguf_add_tensor(ctx_outs[i_split], tensor);
  }

- std::ofstream fout(fname_out, std::ios::binary);
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
- const size_t meta_size = gguf_get_meta_size(ctx_out);
+ // Set split info if needed
+ if (n_split > 1) {
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+ }
+ }

- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+ int cur_split = -1;
+ std::ofstream fout;
+ auto close_ofstream = [&]() {
+ // Write metadata and close file handler
+ if (fout.is_open()) {
+ fout.seekp(0);
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
+ fout.write((const char *) data.data(), data.size());
+ fout.close();
+ }
+ };
+ auto new_ofstream = [&](int index) {
+ cur_split = index;
+ GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+ std::string fname = fname_out;
+ if (params->keep_split) {
+ char split_path[PATH_MAX] = {0};
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+ fname = std::string(split_path);
+ }

- // placeholder for the meta data
- ::zeros(fout, meta_size);
+ fout = std::ofstream(fname, std::ios::binary);
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+ // placeholder for the meta data
+ ::zeros(fout, meta_size);
+ };

  const auto tn = LLM_TN(model.arch);
-
+ new_ofstream(0);
  for (int i = 0; i < ml.n_tensors; ++i) {
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+ auto weight = ml.get_weight(i);
+ struct ggml_tensor * tensor = weight->tensor;
+ if (weight->idx != cur_split && params->keep_split) {
+ close_ofstream();
+ new_ofstream(weight->idx);
+ }

  const std::string name = ggml_get_name(tensor);

@@ -14482,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  total_size_new += new_size;

  // update the gguf meta data as we go
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);

  // write tensor data + padding
  fout.write((const char *) new_data, new_size);
  zeros(fout, GGML_PAD(new_size, align) - new_size);
  }
-
- // go back to beginning of file and write the updated meta data
- {
- fout.seekp(0);
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
- gguf_get_meta_data(ctx_out, data.data());
- fout.write((const char *) data.data(), data.size());
+ close_ofstream();
+ for (auto & c:ctx_outs) {
+ gguf_free(c);
  }

- fout.close();
-
- gguf_free(ctx_out);
-
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

@@ -14857,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  /*.quantize_output_tensor =*/ true,
  /*.only_copy =*/ false,
  /*.pure =*/ false,
+ /*.keep_split =*/ false,
  /*.imatrix =*/ nullptr,
  /*.kv_overrides =*/ nullptr,
  };
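
The new `keep_split` flag makes quantization of a sharded GGUF write one output shard per input shard (named via `llama_split_path`) instead of merging everything into a single file. A hedged usage sketch with the public quantize API; the paths are placeholders:

#include "llama.h"

// Quantize a sharded model while preserving its shard layout.
// Input/output paths are hypothetical; per-shard output names are derived internally.
static uint32_t quantize_keep_split() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.keep_split = true;  // one quantized output shard per input shard
    return llama_model_quantize("model-00001-of-00003.gguf",  // first input shard
                                "model-q4_k_m.gguf",          // output base name
                                &params);
}
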
@@ -15365,6 +15635,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_QWEN2:
  case LLM_ARCH_QWEN2MOE:
  case LLM_ARCH_PHI2:
+ case LLM_ARCH_PHI3:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_STARCODER2:
  return LLAMA_ROPE_TYPE_NEOX;
@@ -15378,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  return LLAMA_ROPE_TYPE_NONE;
  }

+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+ return ctx->cparams.pooling_type;
+ }
+
  int32_t llama_n_vocab(const struct llama_model * model) {
  return model->hparams.n_vocab;
  }
@@ -15856,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
  *
  */
  static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ llama_synchronize(ctx);
+
  // copy rng
  {
  std::ostringstream rng_ss;
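
The state get/set paths now call `llama_synchronize` themselves, so a save taken right after an asynchronous `llama_decode` sees consistent data without the caller synchronizing first. A minimal save sketch using the public state API:

#include <vector>
#include "llama.h"

// Serialize the full context state (RNG, logits, embeddings, KV cache).
// Sketch only; error handling is omitted.
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_state_get_size(ctx));
    const size_t written = llama_state_get_data(ctx, buf.data());  // synchronizes internally
    buf.resize(written);
    return buf;
}
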
@@ -16008,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {

  // Sets the state reading from the specified source address
  size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
+ llama_synchronize(ctx);
+
  const uint8_t * inp = src;

  // set rng
@@ -16312,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
  }

  static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+ llama_synchronize(ctx);
+
  const auto & kv_self = ctx->kv_self;
  GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16429,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
  }

  size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+ llama_synchronize(ctx);
+
  auto & kv_self = ctx->kv_self;
  GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16880,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
  return model->vocab.id_to_token[token].type;
  }

+ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+ return token != -1 && (
+ token == llama_token_eos(model) ||
+ token == llama_token_eot(model)
+ );
+ }
+
  llama_token llama_token_bos(const struct llama_model * model) {
  return model->vocab.special_bos_id;
  }
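
`llama_token_is_eog` replaces ad-hoc `token == llama_token_eos(model)` checks and also treats EOT-style end-of-turn tokens as end of generation. A hedged sketch of a stop check inside a generation loop:

#include <string>
#include "llama.h"

// Append the sampled token's text to `out`, or stop when it marks end of generation.
// Sketch only: `model` and `tok` come from the caller's decode/sample loop.
static bool append_or_stop(const llama_model * model, llama_token tok, std::string & out) {
    if (llama_token_is_eog(model, tok)) {
        return false;  // EOS or EOT (e.g. <|eot_id|>, <|im_end|>), stop generating
    }
    char buf[256];
    const int32_t n = llama_token_to_piece(model, tok, buf, (int32_t) sizeof(buf), /*special=*/false);
    if (n > 0) {
        out.append(buf, n);
    }
    return true;
}
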
@@ -16957,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
  }

  // does not write null-terminator to buf
- int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
  if (0 <= token && token < llama_n_vocab(model)) {
  switch (llama_vocab_get_type(model->vocab)) {
  case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
  }
  memcpy(buf, result.c_str(), result.length());
  return result.length();
- } else if (llama_is_user_defined_token(model->vocab, token)) {
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
  std::string result = model->vocab.id_to_token[token].text;
  if (length < (int) result.length()) {
  return -(int) result.length();
  }
  memcpy(buf, "\xe2\x96\x85", 3);
  return 3;
- } else if (llama_is_control_token(model->vocab, token)) {
- ;
  } else if (llama_is_byte_token(model->vocab, token)) {
  if (length < 1) {
  return -1;
@@ -17007,15 +17297,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
  }
  memcpy(buf, result.c_str(), result.length());
  return result.length();
- } else if (llama_is_user_defined_token(model->vocab, token)) {
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
  std::string result = model->vocab.id_to_token[token].text;
  if (length < (int) result.length()) {
  return -(int) result.length();
  }
  memcpy(buf, result.c_str(), result.length());
  return result.length();
- } else if (llama_is_control_token(model->vocab, token)) {
- ;
  }
  break;
  }
@@ -17213,6 +17503,24 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
  }
+ } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+ // Llama 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+ }
+ if (add_ass) {
+ ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+ }
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+ // Phi 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
  } else {
  // template not supported
  return -1;
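
Both new templates can be selected by name ("llama3", "phi3") or are auto-detected from the model's embedded template via the marker substrings above. A hedged sketch of rendering a prompt with the Phi-3 template through the public API:

#include <string>
#include <vector>
#include "llama.h"

// Format a chat with the built-in "phi3" template and append the assistant header.
// Sketch only; passing a named template means the model pointer is not consulted here.
static std::string format_phi3_chat() {
    std::vector<llama_chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Write a haiku about GGUF."    },
    };
    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(nullptr, "phi3", chat.data(), chat.size(),
                                                /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    // A real caller would retry with a larger buffer when n exceeds buf.size().
    return (n > 0 && n <= (int32_t) buf.size()) ? std::string(buf.data(), n) : std::string();
}
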
@@ -17345,6 +17653,11 @@ const char * llama_print_system_info(void) {
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+ #ifdef GGML_USE_LLAMAFILE
+ s += "LAMMAFILE = 1 | ";
+ #else
+ s += "LAMMAFILE = 0 | ";
+ #endif

  return s.c_str();
  }