llama_cpp 0.14.4 → 0.14.5 (diff of the bundled llama.cpp source)

@@ -261,6 +261,7 @@ enum llm_kv {
261
261
  LLM_KV_GENERAL_ALIGNMENT,
262
262
  LLM_KV_GENERAL_NAME,
263
263
  LLM_KV_GENERAL_AUTHOR,
264
+ LLM_KV_GENERAL_VERSION,
264
265
  LLM_KV_GENERAL_URL,
265
266
  LLM_KV_GENERAL_DESCRIPTION,
266
267
  LLM_KV_GENERAL_LICENSE,
@@ -317,6 +318,8 @@ enum llm_kv {
317
318
  LLM_KV_TOKENIZER_UNK_ID,
318
319
  LLM_KV_TOKENIZER_SEP_ID,
319
320
  LLM_KV_TOKENIZER_PAD_ID,
321
+ LLM_KV_TOKENIZER_CLS_ID,
322
+ LLM_KV_TOKENIZER_MASK_ID,
320
323
  LLM_KV_TOKENIZER_ADD_BOS,
321
324
  LLM_KV_TOKENIZER_ADD_EOS,
322
325
  LLM_KV_TOKENIZER_ADD_PREFIX,
@@ -330,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
330
333
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
331
334
  { LLM_KV_GENERAL_NAME, "general.name" },
332
335
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
336
+ { LLM_KV_GENERAL_VERSION, "general.version" },
333
337
  { LLM_KV_GENERAL_URL, "general.url" },
334
338
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
335
339
  { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -386,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
386
390
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
387
391
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
388
392
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
393
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
394
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
389
395
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
390
396
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
391
397
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
@@ -924,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
924
930
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
925
931
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
926
932
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
933
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
934
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
927
935
  },
928
936
  },
929
937
  {
@@ -1630,17 +1638,17 @@ static size_t llama_get_device_memory(int device) {
1630
1638
  #if defined(GGML_USE_CUDA)
1631
1639
  size_t total;
1632
1640
  size_t free;
1633
- ggml_backend_cuda_get_device_memory(device, &total, &free);
1641
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
1634
1642
  return free;
1635
1643
  #elif defined(GGML_USE_SYCL)
1636
1644
  size_t total;
1637
1645
  size_t free;
1638
- ggml_backend_sycl_get_device_memory(device, &total, &free);
1646
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
1639
1647
  return free;
1640
1648
  #elif defined(GGML_USE_VULKAN)
1641
1649
  size_t total;
1642
1650
  size_t free;
1643
- ggml_backend_vk_get_device_memory(device, &total, &free);
1651
+ ggml_backend_vk_get_device_memory(device, &free, &total);
1644
1652
  return free;
1645
1653
  #else
1646
1654
  return 1;
@@ -1697,6 +1705,8 @@ enum e_model {
1697
1705
  MODEL_MEDIUM,
1698
1706
  MODEL_LARGE,
1699
1707
  MODEL_XL,
1708
+ MODEL_8x7B,
1709
+ MODEL_8x22B,
1700
1710
  };
1701
1711
 
1702
1712
  static const size_t kiB = 1024;
@@ -2014,11 +2024,13 @@ struct llama_vocab {
2014
2024
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
2015
2025
 
2016
2026
  // default LLaMA special tokens
2017
- id special_bos_id = 1;
2018
- id special_eos_id = 2;
2019
- id special_unk_id = 0;
2020
- id special_sep_id = -1;
2021
- id special_pad_id = -1;
2027
+ id special_bos_id = 1;
2028
+ id special_eos_id = 2;
2029
+ id special_unk_id = 0;
2030
+ id special_sep_id = -1;
2031
+ id special_pad_id = -1;
2032
+ id special_cls_id = -1;
2033
+ id special_mask_id = -1;
2022
2034
 
2023
2035
  int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
2024
2036
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
@@ -2175,7 +2187,7 @@ struct llama_context {
2175
2187
 
2176
2188
  std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
2177
2189
  size_t output_size = 0; // capacity (of tokens positions) for the output buffers
2178
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
2190
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
2179
2191
 
2180
2192
  bool logits_all = false;
2181
2193
 
@@ -3548,6 +3560,8 @@ static const char * llama_model_type_name(e_model type) {
3548
3560
  case MODEL_MEDIUM: return "0.4B";
3549
3561
  case MODEL_LARGE: return "0.8B";
3550
3562
  case MODEL_XL: return "1.5B";
3563
+ case MODEL_8x7B: return "8x7B";
3564
+ case MODEL_8x22B: return "8x22B";
3551
3565
  default: return "?B";
3552
3566
  }
3553
3567
  }
@@ -3662,15 +3676,23 @@ static void llm_load_hparams(
3662
3676
  {
3663
3677
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3664
3678
 
3665
- switch (hparams.n_layer) {
3666
- case 22: model.type = e_model::MODEL_1B; break;
3667
- case 26: model.type = e_model::MODEL_3B; break;
3668
- case 32: model.type = e_model::MODEL_7B; break;
3669
- case 40: model.type = e_model::MODEL_13B; break;
3670
- case 48: model.type = e_model::MODEL_34B; break;
3671
- case 60: model.type = e_model::MODEL_30B; break;
3672
- case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
3673
- default: model.type = e_model::MODEL_UNKNOWN;
3679
+ if (hparams.n_expert == 8) {
3680
+ switch (hparams.n_layer) {
3681
+ case 32: model.type = e_model::MODEL_8x7B; break;
3682
+ case 56: model.type = e_model::MODEL_8x22B; break;
3683
+ default: model.type = e_model::MODEL_UNKNOWN;
3684
+ }
3685
+ } else {
3686
+ switch (hparams.n_layer) {
3687
+ case 22: model.type = e_model::MODEL_1B; break;
3688
+ case 26: model.type = e_model::MODEL_3B; break;
3689
+ case 32: model.type = e_model::MODEL_7B; break;
3690
+ case 40: model.type = e_model::MODEL_13B; break;
3691
+ case 48: model.type = e_model::MODEL_34B; break;
3692
+ case 60: model.type = e_model::MODEL_30B; break;
3693
+ case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
3694
+ default: model.type = e_model::MODEL_UNKNOWN;
3695
+ }
3674
3696
  }
3675
3697
  } break;
3676
3698
  case LLM_ARCH_MINICPM:
@@ -3974,7 +3996,9 @@ static void llm_load_hparams(
3974
3996
  }
3975
3997
 
3976
3998
  // TODO: This should probably be in llama.h
3977
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
3999
+ static std::vector<llama_vocab::id> llama_tokenize_internal(
4000
+ const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
4001
+ );
3978
4002
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
3979
4003
 
3980
4004
  static void llm_load_vocab(
@@ -3996,23 +4020,27 @@ static void llm_load_vocab(
3996
4020
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
3997
4021
 
3998
4022
  // default special tokens
3999
- vocab.special_bos_id = -1;
4000
- vocab.special_eos_id = -1;
4001
- vocab.special_unk_id = -1;
4002
- vocab.special_sep_id = -1;
4003
- vocab.special_pad_id = -1;
4004
- vocab.linefeed_id = -1;
4023
+ vocab.special_bos_id = -1;
4024
+ vocab.special_eos_id = -1;
4025
+ vocab.special_unk_id = -1;
4026
+ vocab.special_sep_id = -1;
4027
+ vocab.special_pad_id = -1;
4028
+ vocab.special_cls_id = -1;
4029
+ vocab.special_mask_id = -1;
4030
+ vocab.linefeed_id = -1;
4005
4031
 
4006
4032
  return;
4007
4033
  } else if (tokenizer_name == "llama") {
4008
4034
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4009
4035
 
4010
4036
  // default special tokens
4011
- vocab.special_bos_id = 1;
4012
- vocab.special_eos_id = 2;
4013
- vocab.special_unk_id = 0;
4014
- vocab.special_sep_id = -1;
4015
- vocab.special_pad_id = -1;
4037
+ vocab.special_bos_id = 1;
4038
+ vocab.special_eos_id = 2;
4039
+ vocab.special_unk_id = 0;
4040
+ vocab.special_sep_id = -1;
4041
+ vocab.special_pad_id = -1;
4042
+ vocab.special_cls_id = -1;
4043
+ vocab.special_mask_id = -1;
4016
4044
 
4017
4045
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4018
4046
  if (add_space_prefix_keyidx != -1) {
@@ -4047,20 +4075,24 @@ static void llm_load_vocab(
4047
4075
  }
4048
4076
 
4049
4077
  // default special tokens
4050
- vocab.special_bos_id = 11;
4051
- vocab.special_eos_id = 11;
4052
- vocab.special_unk_id = -1;
4053
- vocab.special_sep_id = -1;
4054
- vocab.special_pad_id = -1;
4078
+ vocab.special_bos_id = 11;
4079
+ vocab.special_eos_id = 11;
4080
+ vocab.special_unk_id = -1;
4081
+ vocab.special_sep_id = -1;
4082
+ vocab.special_pad_id = -1;
4083
+ vocab.special_cls_id = -1;
4084
+ vocab.special_mask_id = -1;
4055
4085
  } else if (tokenizer_name == "bert") {
4056
4086
  vocab.type = LLAMA_VOCAB_TYPE_WPM;
4057
4087
 
4058
4088
  // default special tokens
4059
- vocab.special_bos_id = 101;
4060
- vocab.special_eos_id = 102;
4061
- vocab.special_unk_id = 100;
4062
- vocab.special_sep_id = -1;
4063
- vocab.special_pad_id = -1;
4089
+ vocab.special_bos_id = -1;
4090
+ vocab.special_eos_id = -1;
4091
+ vocab.special_unk_id = 100;
4092
+ vocab.special_sep_id = 102;
4093
+ vocab.special_pad_id = 0;
4094
+ vocab.special_cls_id = 101;
4095
+ vocab.special_mask_id = 103;
4064
4096
  vocab.add_space_prefix = false;
4065
4097
  } else {
4066
4098
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4123,11 +4155,13 @@ static void llm_load_vocab(
4123
4155
  // special tokens
4124
4156
  {
4125
4157
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
4126
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4127
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4128
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4129
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4130
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4158
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4159
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4160
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4161
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4162
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4163
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
4164
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
4131
4165
  };
4132
4166
  for (const auto & it : special_token_types) {
4133
4167
  const std::string & key = kv(std::get<0>(it));
@@ -4319,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4319
4353
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
4320
4354
 
4321
4355
  // special tokens
4322
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4323
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4324
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4325
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4326
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4327
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4356
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4357
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4358
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4359
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4360
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4361
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
4362
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
4363
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4328
4364
  }
4329
4365
 
4330
4366
  // Returns false if cancelled by progress_callback
@@ -5404,6 +5440,11 @@ static bool llm_load_tensors(
5404
5440
 
5405
5441
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5406
5442
 
5443
+ if (n_layer >= 64){
5444
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
5445
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
5446
+ }
5447
+
5407
5448
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5408
5449
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5409
5450
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -9452,6 +9493,31 @@ struct llm_build_context {
9452
9493
  cb(Vcur, "Vcur", il);
9453
9494
  }
9454
9495
 
9496
+ if (model.layers[il].attn_q_norm) {
9497
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
9498
+ ggml_element_size(Qcur) * n_embd_head,
9499
+ ggml_element_size(Qcur) * n_embd_head * n_head,
9500
+ 0);
9501
+ cb(Qcur, "Qcur", il);
9502
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
9503
+ ggml_element_size(Kcur) * n_embd_head,
9504
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
9505
+ 0);
9506
+ cb(Kcur, "Kcur", il);
9507
+
9508
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
9509
+ model.layers[il].attn_q_norm,
9510
+ NULL,
9511
+ LLM_NORM, cb, il);
9512
+ cb(Qcur, "Qcur", il);
9513
+
9514
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
9515
+ model.layers[il].attn_k_norm,
9516
+ NULL,
9517
+ LLM_NORM, cb, il);
9518
+ cb(Kcur, "Kcur", il);
9519
+ }
9520
+
9455
9521
  Qcur = ggml_rope_custom(
9456
9522
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9457
9523
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -10409,6 +10475,9 @@ static int llama_decode_internal(
10409
10475
  n_outputs_prev += lctx.n_outputs;
10410
10476
  }
10411
10477
 
10478
+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
10479
+ lctx.n_outputs = n_outputs;
10480
+
10412
10481
  // wait for the computation to finish (automatically done when obtaining the model output)
10413
10482
  //llama_synchronize(&lctx);
10414
10483
 
@@ -11052,7 +11121,7 @@ struct llm_tokenizer_bpe {
11052
11121
  add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
11053
11122
  }
11054
11123
 
11055
- // add the fnished tokens to the final list keeping correct order for next and prev
11124
+ // add the finished tokens to the final list keeping correct order for next and prev
11056
11125
  for (auto & sym : symbols) {
11057
11126
  if (sym.n > 0) {
11058
11127
  sym.prev = final_prev_index;
@@ -11321,9 +11390,6 @@ struct llm_tokenizer_wpm {
11321
11390
  output.push_back(vocab.special_unk_id);
11322
11391
  }
11323
11392
  }
11324
-
11325
- // append eos token
11326
- output.push_back(vocab.special_eos_id);
11327
11393
  }
11328
11394
 
11329
11395
  std::vector<std::string> preprocess(const std::string & text) {
@@ -11528,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
11528
11594
  }
11529
11595
  }
11530
11596
 
11531
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
11597
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
11532
11598
  std::vector<llama_vocab::id> output;
11533
-
11534
- // OG tokenizer behavior:
11535
- //
11536
- // tokenizer.encode('', add_bos=True) returns [1]
11537
- // tokenizer.encode('', add_bos=False) returns []
11538
-
11539
- if (bos && vocab.special_bos_id != -1) {
11540
- output.push_back(vocab.special_bos_id);
11541
- }
11542
-
11543
- if (raw_text.empty()) {
11544
- return output;
11545
- }
11546
-
11547
11599
  std::forward_list<fragment_buffer_variant> fragment_buffer;
11548
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
11549
11600
 
11550
- if (special) tokenizer_st_partition(vocab, fragment_buffer);
11601
+ if (!raw_text.empty()) {
11602
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
11603
+ if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
11604
+ }
11551
11605
 
11552
11606
  switch (vocab.type) {
11553
11607
  case LLAMA_VOCAB_TYPE_SPM:
11554
11608
  {
11609
+ // OG tokenizer behavior:
11610
+ //
11611
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
11612
+ // tokenizer.encode('', add_special_tokens=False) returns []
11613
+
11614
+ if (add_special && vocab.special_add_bos != 0) {
11615
+ GGML_ASSERT(vocab.special_bos_id != -1);
11616
+ output.push_back(vocab.special_bos_id);
11617
+ }
11618
+
11555
11619
  for (const auto & fragment : fragment_buffer) {
11556
11620
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11557
11621
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -11577,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11577
11641
  output.push_back(fragment.token);
11578
11642
  }
11579
11643
  }
11644
+
11645
+ if (add_special && vocab.special_add_eos == 1) {
11646
+ GGML_ASSERT(vocab.special_eos_id != -1);
11647
+ output.push_back(vocab.special_eos_id);
11648
+ }
11580
11649
  } break;
11581
11650
  case LLAMA_VOCAB_TYPE_BPE:
11582
11651
  {
11652
+ if (add_special && vocab.special_add_bos == 1) {
11653
+ GGML_ASSERT(vocab.special_bos_id != -1);
11654
+ output.push_back(vocab.special_bos_id);
11655
+ }
11656
+
11583
11657
  for (const auto & fragment : fragment_buffer) {
11584
11658
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11585
11659
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11593,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11593
11667
  output.push_back(fragment.token);
11594
11668
  }
11595
11669
  }
11670
+
11671
+ GGML_ASSERT(vocab.special_add_eos != 1);
11596
11672
  } break;
11597
11673
  case LLAMA_VOCAB_TYPE_WPM:
11598
11674
  {
11675
+ if (add_special) {
11676
+ GGML_ASSERT(vocab.special_cls_id != -1);
11677
+ output.push_back(vocab.special_cls_id);
11678
+ }
11679
+
11599
11680
  for (const auto & fragment : fragment_buffer) {
11600
11681
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11601
11682
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11609,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11609
11690
  output.push_back(fragment.token);
11610
11691
  }
11611
11692
  }
11693
+
11694
+ if (add_special) {
11695
+ GGML_ASSERT(vocab.special_sep_id != -1);
11696
+ output.push_back(vocab.special_sep_id);
11697
+ }
11612
11698
  } break;
11613
11699
  case LLAMA_VOCAB_TYPE_NONE:
11614
11700
  GGML_ASSERT(false);
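
Note on the tokenizer hunks above: the internal entry point now takes add_special/parse_special instead of bos/special, SPM and BPE honor the add_bos/add_eos flags, and WPM ("bert") vocabularies wrap the input in CLS ... SEP instead of appending EOS. A minimal calling sketch against the public C API, assuming a loaded llama_model * model and a null-terminated const char * text (the buffer sizing is an assumption, not from the diff):

    // requires "llama.h", <vector>, <cstring>
    const int32_t text_len = (int32_t) strlen(text);
    std::vector<llama_token> tokens(text_len + 8);            // rough upper bound (assumption)
    const int32_t n = llama_tokenize(model, text, text_len,
                                     tokens.data(), (int32_t) tokens.size(),
                                     /*add_special=*/true,    // BOS for SPM/BPE, CLS ... SEP for WPM
                                     /*parse_special=*/false);
    if (n >= 0) tokens.resize(n);                             // a negative result means the buffer was too small
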
@@ -11775,7 +11861,9 @@ static void llama_grammar_advance_stack(
11775
11861
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
11776
11862
 
11777
11863
  if (stack.empty()) {
11778
- new_stacks.emplace_back(stack);
11864
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
11865
+ new_stacks.emplace_back(stack);
11866
+ }
11779
11867
  return;
11780
11868
  }
11781
11869
 
@@ -11812,7 +11900,10 @@ static void llama_grammar_advance_stack(
11812
11900
  }
11813
11901
  case LLAMA_GRETYPE_CHAR:
11814
11902
  case LLAMA_GRETYPE_CHAR_NOT:
11815
- new_stacks.emplace_back(stack);
11903
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
11904
+ // only add the stack if it's not a duplicate of one we already have
11905
+ new_stacks.emplace_back(stack);
11906
+ }
11816
11907
  break;
11817
11908
  default:
11818
11909
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11826,12 +11917,13 @@ static void llama_grammar_advance_stack(
11826
11917
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
11827
11918
  // produces the N possible stacks if the given char is accepted at those
11828
11919
  // positions
11829
- std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11920
+ void llama_grammar_accept(
11830
11921
  const std::vector<std::vector<llama_grammar_element>> & rules,
11831
11922
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
11832
- const uint32_t chr) {
11923
+ const uint32_t chr,
11924
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
11833
11925
 
11834
- std::vector<std::vector<const llama_grammar_element *>> new_stacks;
11926
+ new_stacks.clear();
11835
11927
 
11836
11928
  for (const auto & stack : stacks) {
11837
11929
  if (stack.empty()) {
@@ -11850,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11850
11942
  llama_grammar_advance_stack(rules, new_stack, new_stacks);
11851
11943
  }
11852
11944
  }
11853
-
11854
- return new_stacks;
11855
11945
  }
11856
11946
 
11857
11947
  static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11865,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
11865
11955
  const std::vector<llama_grammar_candidate> & candidates) {
11866
11956
 
11867
11957
  std::vector<llama_grammar_candidate> rejects;
11958
+ rejects.reserve(candidates.size());
11868
11959
 
11869
11960
  if (stack.empty()) {
11870
11961
  for (const auto & tok : candidates) {
@@ -11878,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
11878
11969
  const llama_grammar_element * stack_pos = stack.back();
11879
11970
 
11880
11971
  std::vector<llama_grammar_candidate> next_candidates;
11972
+ next_candidates.reserve(candidates.size());
11973
+
11881
11974
  for (const auto & tok : candidates) {
11882
11975
  if (*tok.code_points == 0) {
11883
11976
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12685,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
12685
12778
  // Note terminating 0 in decoded string
12686
12779
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
12687
12780
  const auto & code_points = decoded.first;
12781
+ std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
12688
12782
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
12689
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
12783
+ llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
12784
+ grammar->stacks = tmp_new_stacks;
12690
12785
  }
12691
12786
  grammar->partial_utf8 = decoded.second;
12692
12787
  GGML_ASSERT(!grammar->stacks.empty());
@@ -13318,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
13318
13413
  return new_type;
13319
13414
  }
13320
13415
 
13321
- static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
13416
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
13322
13417
  std::mutex mutex;
13323
- int counter = 0;
13418
+ int64_t counter = 0;
13324
13419
  size_t new_size = 0;
13325
13420
  if (nthread < 2) {
13326
13421
  // single-thread
@@ -13328,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
13328
13423
  }
13329
13424
  auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
13330
13425
  nrows, n_per_row, imatrix]() {
13331
- const int nrows_per_chunk = chunk_size / n_per_row;
13426
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
13332
13427
  size_t local_size = 0;
13333
13428
  while (true) {
13334
13429
  std::unique_lock<std::mutex> lock(mutex);
13335
- int first_row = counter; counter += nrows_per_chunk;
13430
+ int64_t first_row = counter; counter += nrows_per_chunk;
13336
13431
  if (first_row >= nrows) {
13337
13432
  if (local_size > 0) {
13338
13433
  new_size += local_size;
@@ -13340,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
13340
13435
  break;
13341
13436
  }
13342
13437
  lock.unlock();
13343
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
13438
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
13344
13439
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
13345
13440
  }
13346
13441
  };
@@ -13463,7 +13558,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13463
13558
  const std::string name = ggml_get_name(meta);
13464
13559
 
13465
13560
  // TODO: avoid hardcoded tensor names - use the TN_* constants
13466
- if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
13561
+ if (name.find("attn_v.weight") != std::string::npos ||
13562
+ name.find("attn_qkv.weight") != std::string::npos) {
13467
13563
  ++qs.n_attention_wv;
13468
13564
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13469
13565
  qs.has_output = true;
@@ -13473,7 +13569,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13473
13569
  qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13474
13570
 
13475
13571
  // sanity checks
13476
- GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
13572
+ //
13573
+ // - qs.n_attention_wv == 0 for Mamba models
13574
+ // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
13575
+ //
13576
+ GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
13477
13577
 
13478
13578
  size_t total_size_org = 0;
13479
13579
  size_t total_size_new = 0;
@@ -13529,6 +13629,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13529
13629
 
13530
13630
  // quantize only 2D and 3D tensors (experts)
13531
13631
  quantize &= (ggml_n_dims(tensor) >= 2);
13632
+
13633
+ // do not quantize norm tensors
13634
+ quantize &= name.find("_norm.weight") == std::string::npos;
13635
+
13532
13636
  quantize &= params->quantize_output_tensor || name != "output.weight";
13533
13637
  quantize &= !params->only_copy;
13534
13638
 
@@ -13557,10 +13661,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13557
13661
  if (!params->pure && ggml_is_quantized(default_type)) {
13558
13662
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
13559
13663
  }
13560
- else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13664
+ if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13561
13665
  new_type = params->token_embedding_type;
13562
13666
  }
13563
- else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13667
+ if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13564
13668
  new_type = params->output_tensor_type;
13565
13669
  }
13566
13670
 
@@ -13575,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13575
13679
  new_size = ggml_nbytes(tensor);
13576
13680
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
13577
13681
  } else {
13578
- const size_t nelements = ggml_nelements(tensor);
13682
+ const int64_t nelements = ggml_nelements(tensor);
13579
13683
 
13580
13684
  const float * imatrix = nullptr;
13581
13685
  if (imatrix_data) {
@@ -13627,20 +13731,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13627
13731
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
13628
13732
  fflush(stdout);
13629
13733
 
13630
- if (work.size() < nelements * 4) {
13734
+ if (work.size() < (size_t)nelements * 4) {
13631
13735
  work.resize(nelements * 4); // upper bound on size
13632
13736
  }
13633
13737
  new_data = work.data();
13634
13738
 
13635
- const int n_per_row = tensor->ne[0];
13636
- const int nrows = tensor->ne[1];
13739
+ const int64_t n_per_row = tensor->ne[0];
13740
+ const int64_t nrows = tensor->ne[1];
13637
13741
 
13638
- static const int min_chunk_size = 32 * 512;
13639
- const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
13742
+ static const int64_t min_chunk_size = 32 * 512;
13743
+ const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
13640
13744
 
13641
- const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13642
- const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
13643
- const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
13745
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
13746
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
13747
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
13644
13748
 
13645
13749
  // quantize each expert separately since they have different importance matrices
13646
13750
  new_size = 0;
@@ -14905,9 +15009,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
14905
15009
  llama_kv_cache_update_internal(*ctx);
14906
15010
  }
14907
15011
 
15012
+ // deprecated
15013
+ size_t llama_get_state_size(const struct llama_context * ctx) {
15014
+ return llama_state_get_size(ctx);
15015
+ }
15016
+
15017
+ // deprecated
15018
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
15019
+ return llama_state_get_data(ctx, dst);
15020
+ }
15021
+
15022
+ // deprecated
15023
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15024
+ return llama_state_set_data(ctx, src);
15025
+ }
15026
+
15027
+ // deprecated
15028
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15029
+ return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15030
+ }
15031
+
15032
+ // deprecated
15033
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15034
+ return llama_state_save_file(ctx, path_session, tokens, n_token_count);
15035
+ }
14908
15036
 
14909
15037
  // Returns the *maximum* size of the state
14910
- size_t llama_get_state_size(const struct llama_context * ctx) {
15038
+ size_t llama_state_get_size(const struct llama_context * ctx) {
14911
15039
  const auto & cparams = ctx->cparams;
14912
15040
  const auto & hparams = ctx->model.hparams;
14913
15041
 
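
Note on the renames above: the old llama_get_state_size / llama_copy_state_data / llama_set_state_data / llama_*_session_file entry points remain as deprecated wrappers. A minimal sketch of a full-context snapshot and restore with the new names, assuming a valid llama_context * ctx:

    // requires "llama.h" and <vector>
    std::vector<uint8_t> state(llama_state_get_size(ctx));            // maximum state size
    const size_t written = llama_state_get_data(ctx, state.data());   // was llama_copy_state_data
    // ... later, on the same or an identically configured context:
    const size_t read = llama_state_set_data(ctx, state.data());      // was llama_set_state_data
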
@@ -14995,15 +15123,15 @@ struct llama_data_file_context : llama_data_context {
14995
15123
  * file context:
14996
15124
  * llama_file file("/path", "wb");
14997
15125
  * llama_data_file_context data_ctx(&file);
14998
- * llama_copy_state_data(ctx, &data_ctx);
15126
+ * llama_state_get_data(ctx, &data_ctx);
14999
15127
  *
15000
15128
  * buffer context:
15001
15129
  * std::vector<uint8_t> buf(max_size, 0);
15002
15130
  * llama_data_buffer_context data_ctx(&buf.data());
15003
- * llama_copy_state_data(ctx, &data_ctx);
15131
+ * llama_state_get_data(ctx, &data_ctx);
15004
15132
  *
15005
15133
  */
15006
- static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
15134
+ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
15007
15135
  // copy rng
15008
15136
  {
15009
15137
  std::ostringstream rng_ss;
@@ -15147,15 +15275,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
15147
15275
  }
15148
15276
  }
15149
15277
 
15150
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
15278
+ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
15151
15279
  llama_data_buffer_context data_ctx(dst);
15152
- llama_copy_state_data_internal(ctx, &data_ctx);
15280
+ llama_state_get_data_internal(ctx, &data_ctx);
15153
15281
 
15154
15282
  return data_ctx.get_size_written();
15155
15283
  }
15156
15284
 
15157
15285
  // Sets the state reading from the specified source address
15158
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15286
+ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
15159
15287
  const uint8_t * inp = src;
15160
15288
 
15161
15289
  // set rng
@@ -15307,14 +15435,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15307
15435
  }
15308
15436
 
15309
15437
  const size_t nread = inp - src;
15310
- const size_t max_size = llama_get_state_size(ctx);
15438
+ const size_t max_size = llama_state_get_size(ctx);
15311
15439
 
15312
15440
  GGML_ASSERT(nread <= max_size);
15313
15441
 
15314
15442
  return nread;
15315
15443
  }
15316
15444
 
15317
- static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15445
+ static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15318
15446
  llama_file file(path_session, "rb");
15319
15447
 
15320
15448
  // sanity checks
@@ -15352,7 +15480,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
15352
15480
  // restore the context state
15353
15481
  {
15354
15482
  const size_t n_state_size_cur = file.size - file.tell();
15355
- const size_t n_state_size_max = llama_get_state_size(ctx);
15483
+ const size_t n_state_size_max = llama_state_get_size(ctx);
15356
15484
 
15357
15485
  if (n_state_size_cur > n_state_size_max) {
15358
15486
  LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -15362,22 +15490,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
15362
15490
  std::vector<uint8_t> state_data(n_state_size_max);
15363
15491
  file.read_raw(state_data.data(), n_state_size_cur);
15364
15492
 
15365
- llama_set_state_data(ctx, state_data.data());
15493
+ llama_state_set_data(ctx, state_data.data());
15366
15494
  }
15367
15495
 
15368
15496
  return true;
15369
15497
  }
15370
15498
 
15371
- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15499
+ bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15372
15500
  try {
15373
- return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15501
+ return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15374
15502
  } catch (const std::exception & err) {
15375
15503
  LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
15376
15504
  return false;
15377
15505
  }
15378
15506
  }
15379
15507
 
15380
- bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15508
+ static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15381
15509
  llama_file file(path_session, "wb");
15382
15510
 
15383
15511
  file.write_u32(LLAMA_SESSION_MAGIC);
@@ -15391,11 +15519,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
15391
15519
 
15392
15520
  // save the context state using stream saving
15393
15521
  llama_data_file_context data_ctx(&file);
15394
- llama_copy_state_data_internal(ctx, &data_ctx);
15522
+ llama_state_get_data_internal(ctx, &data_ctx);
15395
15523
 
15396
15524
  return true;
15397
15525
  }
15398
15526
 
15527
+ bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15528
+ try {
15529
+ return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
15530
+ } catch (const std::exception & err) {
15531
+ LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
15532
+ return false;
15533
+ }
15534
+ }
15535
+
15536
+ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
15537
+ // save the size of size_t as a uint32_t for safety check
15538
+ const size_t size_t_size_size = sizeof(uint32_t);
15539
+
15540
+ // other values
15541
+ const size_t s_cell_count_size = sizeof(uint32_t);
15542
+ const size_t s_layer_count_size = sizeof(uint32_t);
15543
+ const size_t n_embd_v_gqa_size = sizeof(uint32_t);
15544
+
15545
+ size_t s_cell_count = 0;
15546
+ size_t s_cell_data_size = 0;
15547
+ const auto & kv_self = ctx->kv_self;
15548
+ const auto & hparams = ctx->model.hparams;
15549
+
15550
+ const uint32_t n_layer = hparams.n_layer;
15551
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
15552
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
15553
+
15554
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
15555
+ const auto & cell = kv_self.cells[i];
15556
+ if (cell.seq_id.count(seq_id) > 0) {
15557
+ ++s_cell_count;
15558
+ s_cell_data_size += sizeof(llama_pos);
15559
+ }
15560
+ }
15561
+
15562
+ for (int il = 0; il < (int)n_layer; ++il) {
15563
+ // types of keys and values
15564
+ s_cell_data_size += sizeof(int32_t) * 2;
15565
+ // k_size_row and v_size_el values of layer
15566
+ s_cell_data_size += sizeof(size_t) * 2;
15567
+
15568
+ // keys
15569
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
15570
+ s_cell_data_size += k_size_row * s_cell_count;
15571
+
15572
+ // values (transposed)
15573
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
15574
+ s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
15575
+ }
15576
+
15577
+ const size_t s_total = (
15578
+ size_t_size_size +
15579
+ s_cell_count_size +
15580
+ s_layer_count_size +
15581
+ n_embd_v_gqa_size +
15582
+ s_cell_data_size
15583
+ );
15584
+
15585
+ return s_total;
15586
+ }
15587
+
15588
+ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
15589
+ const auto & kv_self = ctx->kv_self;
15590
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
15591
+
15592
+ // Save the size of size_t as a uint32_t for safety check
15593
+ const uint32_t size_t_size = sizeof(size_t);
15594
+ data_ctx.write(&size_t_size, sizeof(size_t_size));
15595
+
15596
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
15597
+ uint32_t cell_count = 0;
15598
+
15599
+ // Count the number of cells with the specified seq_id
15600
+ // Find all the ranges of cells with this seq id
15601
+ {
15602
+ uint32_t cell_range_begin = kv_self.size;
15603
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
15604
+ const auto & cell = kv_self.cells[i];
15605
+ if (cell.has_seq_id(seq_id)) {
15606
+ ++cell_count;
15607
+ if (cell_range_begin == kv_self.size) {
15608
+ cell_range_begin = i;
15609
+ }
15610
+ }
15611
+ else {
15612
+ if (cell_range_begin != kv_self.size) {
15613
+ cell_ranges.push_back({ cell_range_begin, i });
15614
+ cell_range_begin = kv_self.size;
15615
+ }
15616
+ }
15617
+ }
15618
+ if (cell_range_begin != kv_self.size) {
15619
+ cell_ranges.push_back({ cell_range_begin, kv_self.size });
15620
+ }
15621
+
15622
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
15623
+ uint32_t cell_count_check = 0;
15624
+ for (const auto & range : cell_ranges) {
15625
+ cell_count_check += range.second - range.first;
15626
+ }
15627
+ GGML_ASSERT(cell_count == cell_count_check);
15628
+ }
15629
+
15630
+ // Write the cell count
15631
+ data_ctx.write(&cell_count, sizeof(cell_count));
15632
+
15633
+ const auto & hparams = ctx->model.hparams;
15634
+ const uint32_t n_layer = hparams.n_layer;
15635
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
15636
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
15637
+
15638
+ // Write the layer count
15639
+ data_ctx.write(&n_layer, sizeof(n_layer));
15640
+
15641
+ // Write n_embd_v_gqa
15642
+ data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
15643
+
15644
+ // Iterate the ranges and write all the pos (this is the token position in the prompt)
15645
+ for (const auto & range : cell_ranges) {
15646
+ for (uint32_t i = range.first; i < range.second; ++i) {
15647
+ const auto & cell = kv_self.cells[i];
15648
+ data_ctx.write(&cell.pos, sizeof(cell.pos));
15649
+ }
15650
+ }
15651
+
15652
+ // Iterate and write all the keys first, each row is a cell
15653
+ // Get whole range at a time
15654
+ std::vector<uint8_t> tmp_buf;
15655
+ for (int il = 0; il < (int)n_layer; ++il) {
15656
+ // Write key type
15657
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
15658
+ data_ctx.write(&k_type_i, sizeof(k_type_i));
15659
+
15660
+ // Write row size of key
15661
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
15662
+ data_ctx.write(&k_size_row, sizeof(k_size_row));
15663
+
15664
+ // Read each range of cells of k_size length each into tmp_buf and write out
15665
+ for (const auto & range : cell_ranges) {
15666
+ const size_t range_size = range.second - range.first;
15667
+ tmp_buf.resize(range_size * k_size_row);
15668
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
15669
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
15670
+ }
15671
+ }
15672
+
15673
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
15674
+ const uint32_t kv_size = kv_self.size;
15675
+ for (int il = 0; il < (int)n_layer; ++il) {
15676
+ // Write value type
15677
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
15678
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
15679
+
15680
+ // Write element size
15681
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
15682
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
15683
+
15684
+ // For each row, we get the element values of each cell
15685
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
15686
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
15687
+ for (const auto & range : cell_ranges) {
15688
+ const size_t range_size = range.second - range.first;
15689
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
15690
+ tmp_buf.resize(range_size * v_size_el);
15691
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
15692
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
15693
+ }
15694
+ }
15695
+ }
15696
+
15697
+ return data_ctx.get_size_written();
15698
+ }
15699
+
15700
+ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
15701
+ llama_data_buffer_context data_ctx(dst);
15702
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
15703
+ }
15704
+
15705
+ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
15706
+ auto & kv_self = ctx->kv_self;
15707
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
15708
+
15709
+ // Wipe the slot
15710
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15711
+
15712
+ const uint8_t * inp = src;
15713
+
15714
+ // Read size of size_t
15715
+ uint32_t size_t_size;
15716
+ memcpy(&size_t_size, inp, sizeof(size_t_size));
15717
+ inp += sizeof(size_t_size);
15718
+ if (size_t_size != sizeof(size_t)) {
15719
+ LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
15720
+ return 0;
15721
+ }
15722
+
15723
+ // Read the cell count
15724
+ uint32_t cell_count;
15725
+ memcpy(&cell_count, inp, sizeof(cell_count));
15726
+ inp += sizeof(cell_count);
15727
+
15728
+ // Read the layer count
15729
+ uint32_t n_layer_ref;
15730
+ memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
15731
+ inp += sizeof(n_layer_ref);
15732
+
15733
+ // Read n_embd_v_gqa
15734
+ uint32_t n_embd_v_gqa_ref;
15735
+ memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
15736
+ inp += sizeof(n_embd_v_gqa_ref);
15737
+
15738
+ // Sanity check model compatibility
15739
+ const auto & hparams = ctx->model.hparams;
15740
+ const uint32_t n_layer = hparams.n_layer;
15741
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
15742
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
15743
+ if (n_layer != n_layer_ref) {
15744
+ LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
15745
+ return 0;
15746
+ }
15747
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
15748
+ LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
15749
+ return 0;
15750
+ }
15751
+
15752
+ // Allocate the new cells for the slot
15753
+ if (cell_count) {
15754
+ llama_batch batch = llama_batch_init(cell_count, 0, 1);
15755
+ batch.n_tokens = cell_count;
15756
+ for (uint32_t i = 0; i < cell_count; ++i) {
15757
+ llama_pos pos;
15758
+ memcpy(&pos, inp, sizeof(pos));
15759
+ inp += sizeof(pos);
15760
+
15761
+ batch.pos[i] = pos;
15762
+ batch.n_seq_id[i] = 1;
15763
+ batch.seq_id[i][0] = dest_seq_id;
15764
+ }
15765
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
15766
+ llama_batch_free(batch);
15767
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
15768
+ return 0;
15769
+ }
15770
+
15771
+ // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
15772
+ // Assume that this is one contiguous block of cells
15773
+ GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
15774
+ GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
15775
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
15776
+ GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
15777
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
15778
+
15779
+ // Cleanup
15780
+ llama_batch_free(batch);
15781
+ }
15782
+
15783
+ const uint32_t kv_size = kv_self.size;
15784
+ const uint32_t kv_head = kv_self.head;
15785
+
15786
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
15787
+ for (int il = 0; il < (int)n_layer; ++il) {
15788
+ // Read type of key
15789
+ int32_t k_type_i_ref;
15790
+ memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
15791
+ inp += sizeof(k_type_i_ref);
15792
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
15793
+ if (k_type_i != k_type_i_ref) {
15794
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15795
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
15796
+ return 0;
15797
+ }
15798
+
15799
+ // Read row size of key
15800
+ size_t k_size_row_ref;
15801
+ memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
15802
+ inp += sizeof(k_size_row_ref);
15803
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
15804
+ if (k_size_row != k_size_row_ref) {
15805
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15806
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
15807
+ return 0;
15808
+ }
15809
+
15810
+ if (cell_count) {
15811
+ // Read and set the keys for the whole cell range
15812
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
15813
+ inp += cell_count * k_size_row;
15814
+ }
15815
+ }
15816
+
15817
+ // For each layer, read the values for each cell (transposed)
15818
+ for (int il = 0; il < (int)n_layer; ++il) {
15819
+ // Read type of value
15820
+ int32_t v_type_i_ref;
15821
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
15822
+ inp += sizeof(v_type_i_ref);
15823
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
15824
+ if (v_type_i != v_type_i_ref) {
15825
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15826
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
15827
+ return 0;
15828
+ }
15829
+
15830
+ // Read element size of value
15831
+ size_t v_size_el_ref;
15832
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
15833
+ inp += sizeof(v_size_el_ref);
15834
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
15835
+ if (v_size_el != v_size_el_ref) {
15836
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15837
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
15838
+ return 0;
15839
+ }
15840
+
15841
+ if (cell_count) {
15842
+ // For each row in the transposed matrix, read the values for the whole cell range
15843
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
15844
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
15845
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
15846
+ inp += cell_count * v_size_el;
15847
+ }
15848
+ }
15849
+ }
15850
+
15851
+ const size_t nread = inp - src;
15852
+ return nread;
15853
+ }
15854
+
15855
+ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
15856
+ llama_file file(filepath, "wb");
15857
+
15858
+ file.write_u32(LLAMA_STATE_SEQ_MAGIC);
15859
+ file.write_u32(LLAMA_STATE_SEQ_VERSION);
15860
+
15861
+ // save the prompt
15862
+ file.write_u32((uint32_t)n_token_count);
15863
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
15864
+
15865
+ // save the context state using stream saving
15866
+ llama_data_file_context data_ctx(&file);
15867
+ llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
15868
+
15869
+ const size_t res = file.tell();
15870
+ GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
15871
+ return res;
15872
+ }
15873
+
15874
+ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15875
+ llama_file file(filepath, "rb");
15876
+
15877
+ // version checks
15878
+ {
15879
+ const uint32_t magic = file.read_u32();
15880
+ const uint32_t version = file.read_u32();
15881
+
15882
+ if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
15883
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
15884
+ return 0;
15885
+ }
15886
+ }
15887
+
15888
+ // load the prompt
15889
+ {
15890
+ const uint32_t n_token_count = file.read_u32();
15891
+
15892
+ if (n_token_count > n_token_capacity) {
15893
+ LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
15894
+ return 0;
15895
+ }
15896
+
15897
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
15898
+ *n_token_count_out = n_token_count;
15899
+ }
15900
+
15901
+ // restore the context state
15902
+ {
15903
+ const size_t state_size = file.size - file.tell();
15904
+ std::vector<uint8_t> state_data(state_size);
15905
+ file.read_raw(state_data.data(), state_size);
15906
+ const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
15907
+ if (!nread) {
15908
+ LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
15909
+ return 0;
15910
+ }
15911
+ GGML_ASSERT(nread <= state_size);
15912
+ GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
15913
+ }
15914
+
15915
+ return file.tell();
15916
+ }
15917
+
15918
+ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
15919
+ try {
15920
+ return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
15921
+ } catch (const std::exception & err) {
15922
+ LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
15923
+ return 0;
15924
+ }
15925
+ }
15926
+
15927
+ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15928
+ try {
15929
+ return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
15930
+ } catch (const std::exception & err) {
15931
+ LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
15932
+ return 0;
15933
+ }
15934
+ }
15935
+
15399
15936
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
15400
15937
  ctx->cparams.n_threads = n_threads;
15401
15938
  ctx->cparams.n_threads_batch = n_threads_batch;
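
Note on the block above: it adds per-sequence state files. A sketch of saving the KV cells of one sequence and restoring them elsewhere, assuming the prompt tokens of sequence 0 are held in std::vector<llama_token> prompt; both calls return 0 on failure:

    // requires "llama.h" and <vector>
    const size_t saved = llama_state_seq_save_file(ctx, "seq0.bin", /*seq_id=*/0,
                                                   prompt.data(), prompt.size());
    // restore into sequence 1 of another (compatible) context:
    std::vector<llama_token> tokens(prompt.size());   // capacity assumption
    size_t n_tokens = 0;
    const size_t loaded = llama_state_seq_load_file(ctx, "seq0.bin", /*dest_seq_id=*/1,
                                                    tokens.data(), tokens.size(), &n_tokens);
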
@@ -15509,23 +16046,31 @@ float * llama_get_logits(struct llama_context * ctx) {
15509
16046
  }
15510
16047
 
15511
16048
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
16049
+ int32_t j = -1;
15512
16050
  llama_synchronize(ctx);
15513
16051
 
15514
16052
  try {
15515
16053
  if (ctx->logits == nullptr) {
15516
16054
  throw std::runtime_error("no logits");
15517
16055
  }
15518
- if ((size_t) i >= ctx->output_ids.size()) {
16056
+
16057
+ if (i < 0) {
16058
+ j = ctx->n_outputs + i;
16059
+ if (j < 0) {
16060
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16061
+ }
16062
+ } else if ((size_t) i >= ctx->output_ids.size()) {
15519
16063
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16064
+ } else {
16065
+ j = ctx->output_ids[i];
15520
16066
  }
15521
- const int32_t j = ctx->output_ids[i];
15522
16067
 
15523
16068
  if (j < 0) {
15524
16069
  throw std::runtime_error(format("batch.logits[%d] != true", i));
15525
16070
  }
15526
- if ((size_t) j >= ctx->output_size) {
16071
+ if (j >= ctx->n_outputs) {
15527
16072
  // This should not happen
15528
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
16073
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
15529
16074
  }
15530
16075
 
15531
16076
  return ctx->logits + j*ctx->model.hparams.n_vocab;
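
Note on the hunk above: with lctx.n_outputs now carrying the total output count of the last batch (see the llama_decode_internal hunk earlier), llama_get_logits_ith (and, in the next hunk, llama_get_embeddings_ith) accepts negative indices counted from the end of that batch. A minimal sketch:

    // requires "llama.h"
    float * last = llama_get_logits_ith(ctx, -1);                 // logits of the last output in the batch
    const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));
    // last[0 .. n_vocab-1] are the vocabulary logits for that position
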
@@ -15545,23 +16090,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
15545
16090
  }
15546
16091
 
15547
16092
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
16093
+ int32_t j = -1;
16094
+
15548
16095
  llama_synchronize(ctx);
15549
16096
 
15550
16097
  try {
15551
16098
  if (ctx->embd == nullptr) {
15552
16099
  throw std::runtime_error("no embeddings");
15553
16100
  }
15554
- if ((size_t) i >= ctx->output_ids.size()) {
16101
+
16102
+ if (i < 0) {
16103
+ j = ctx->n_outputs + i;
16104
+ if (j < 0) {
16105
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16106
+ }
16107
+ } else if ((size_t) i >= ctx->output_ids.size()) {
15555
16108
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16109
+ } else {
16110
+ j = ctx->output_ids[i];
15556
16111
  }
15557
- const int32_t j = ctx->output_ids[i];
15558
16112
 
15559
16113
  if (j < 0) {
15560
16114
  throw std::runtime_error(format("batch.logits[%d] != true", i));
15561
16115
  }
15562
- if ((size_t) j >= ctx->output_size) {
16116
+ if (j >= ctx->n_outputs) {
15563
16117
  // This should not happen
15564
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
16118
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
15565
16119
  }
15566
16120
 
15567
16121
  return ctx->embd + j*ctx->model.hparams.n_embd;
@@ -15608,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
15608
16162
  return model->vocab.special_eos_id;
15609
16163
  }
15610
16164
 
16165
+ llama_token llama_token_cls(const struct llama_model * model) {
16166
+ return model->vocab.special_cls_id;
16167
+ }
16168
+
16169
+ llama_token llama_token_sep(const struct llama_model * model) {
16170
+ return model->vocab.special_sep_id;
16171
+ }
16172
+
15611
16173
  llama_token llama_token_nl(const struct llama_model * model) {
15612
16174
  return model->vocab.linefeed_id;
15613
16175
  }
@@ -15642,9 +16204,9 @@ int32_t llama_tokenize(
15642
16204
  int32_t text_len,
15643
16205
  llama_token * tokens,
15644
16206
  int32_t n_tokens_max,
15645
- bool add_bos,
15646
- bool special) {
15647
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
16207
+ bool add_special,
16208
+ bool parse_special) {
16209
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
15648
16210
 
15649
16211
  if (n_tokens_max < (int) res.size()) {
15650
16212
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);