llama_cpp 0.14.4 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -261,6 +261,7 @@ enum llm_kv {
  LLM_KV_GENERAL_ALIGNMENT,
  LLM_KV_GENERAL_NAME,
  LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_VERSION,
  LLM_KV_GENERAL_URL,
  LLM_KV_GENERAL_DESCRIPTION,
  LLM_KV_GENERAL_LICENSE,
@@ -317,6 +318,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_CLS_ID,
+ LLM_KV_TOKENIZER_MASK_ID,
  LLM_KV_TOKENIZER_ADD_BOS,
  LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_ADD_PREFIX,
@@ -330,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
  { LLM_KV_GENERAL_NAME, "general.name" },
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_VERSION, "general.version" },
  { LLM_KV_GENERAL_URL, "general.url" },
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
  { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -386,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
@@ -924,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
  {
@@ -1630,17 +1638,17 @@ static size_t llama_get_device_memory(int device) {
 #if defined(GGML_USE_CUDA)
  size_t total;
  size_t free;
- ggml_backend_cuda_get_device_memory(device, &total, &free);
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
  return free;
 #elif defined(GGML_USE_SYCL)
  size_t total;
  size_t free;
- ggml_backend_sycl_get_device_memory(device, &total, &free);
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
  return free;
 #elif defined(GGML_USE_VULKAN)
  size_t total;
  size_t free;
- ggml_backend_vk_get_device_memory(device, &total, &free);
+ ggml_backend_vk_get_device_memory(device, &free, &total);
  return free;
 #else
  return 1;
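
Note on the hunk above: the ggml backend getters fill their output pointers in (free, total) order, so the old calls, which passed &total first, made llama_get_device_memory return the device's total memory rather than the free amount. A minimal sketch of the corrected usage, assuming the usual ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) signature:

    size_t free  = 0;
    size_t total = 0;
    ggml_backend_cuda_get_device_memory(/*device=*/0, &free, &total);
    // `free` now really holds the unused VRAM and `total` the device capacity
    LLAMA_LOG_INFO("device 0: %zu MiB free of %zu MiB\n", free / 1024 / 1024, total / 1024 / 1024);
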
@@ -1697,6 +1705,8 @@ enum e_model {
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_8x7B,
+ MODEL_8x22B,
  };

  static const size_t kiB = 1024;
@@ -2014,11 +2024,13 @@ struct llama_vocab {
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;

  // default LLaMA special tokens
- id special_bos_id = 1;
- id special_eos_id = 2;
- id special_unk_id = 0;
- id special_sep_id = -1;
- id special_pad_id = -1;
+ id special_bos_id = 1;
+ id special_eos_id = 2;
+ id special_unk_id = 0;
+ id special_sep_id = -1;
+ id special_pad_id = -1;
+ id special_cls_id = -1;
+ id special_mask_id = -1;

  int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
@@ -2175,7 +2187,7 @@ struct llama_context {

  std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
  size_t output_size = 0; // capacity (of tokens positions) for the output buffers
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

  bool logits_all = false;

@@ -3548,6 +3560,8 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
  default: return "?B";
  }
  }
@@ -3662,15 +3676,23 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

- switch (hparams.n_layer) {
- case 22: model.type = e_model::MODEL_1B; break;
- case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- case 48: model.type = e_model::MODEL_34B; break;
- case 60: model.type = e_model::MODEL_30B; break;
- case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ if (hparams.n_expert == 8) {
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_8x7B; break;
+ case 56: model.type = e_model::MODEL_8x22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 26: model.type = e_model::MODEL_3B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_34B; break;
+ case 60: model.type = e_model::MODEL_30B; break;
+ case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
  }
  } break;
  case LLM_ARCH_MINICPM:
@@ -3974,7 +3996,9 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
+ static std::vector<llama_vocab::id> llama_tokenize_internal(
+ const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+ );
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
@@ -3996,23 +4020,27 @@ static void llm_load_vocab(
  vocab.type = LLAMA_VOCAB_TYPE_NONE;

  // default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.linefeed_id = -1;
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;
+ vocab.linefeed_id = -1;

  return;
  } else if (tokenizer_name == "llama") {
  vocab.type = LLAMA_VOCAB_TYPE_SPM;

  // default special tokens
- vocab.special_bos_id = 1;
- vocab.special_eos_id = 2;
- vocab.special_unk_id = 0;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = 1;
+ vocab.special_eos_id = 2;
+ vocab.special_unk_id = 0;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;

  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
@@ -4047,20 +4075,24 @@ static void llm_load_vocab(
  }

  // default special tokens
- vocab.special_bos_id = 11;
- vocab.special_eos_id = 11;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = 11;
+ vocab.special_eos_id = 11;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;
  } else if (tokenizer_name == "bert") {
  vocab.type = LLAMA_VOCAB_TYPE_WPM;

  // default special tokens
- vocab.special_bos_id = 101;
- vocab.special_eos_id = 102;
- vocab.special_unk_id = 100;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = 100;
+ vocab.special_sep_id = 102;
+ vocab.special_pad_id = 0;
+ vocab.special_cls_id = 101;
+ vocab.special_mask_id = 103;
  vocab.add_space_prefix = false;
  } else {
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4123,11 +4155,13 @@ static void llm_load_vocab(
  // special tokens
  {
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
  };
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
@@ -4319,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

  // Returns false if cancelled by progress_callback
@@ -5404,6 +5440,11 @@ static bool llm_load_tensors(

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

+ if (n_layer >= 64){
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+ }
+
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -9452,6 +9493,31 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

+ if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur) * n_embd_head,
+ ggml_element_size(Qcur) * n_embd_head * n_head,
+ 0);
+ cb(Qcur, "Qcur", il);
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+ ggml_element_size(Kcur) * n_embd_head,
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+ 0);
+ cb(Kcur, "Kcur", il);
+
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
+ model.layers[il].attn_q_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Kcur, "Kcur", il);
+ }
+
  Qcur = ggml_rope_custom(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -10409,6 +10475,9 @@ static int llama_decode_internal(
  n_outputs_prev += lctx.n_outputs;
  }

+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
+ lctx.n_outputs = n_outputs;
+
  // wait for the computation to finish (automatically done when obtaining the model output)
  //llama_synchronize(&lctx);

@@ -11052,7 +11121,7 @@ struct llm_tokenizer_bpe {
  add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
  }

- // add the fnished tokens to the final list keeping correct order for next and prev
+ // add the finished tokens to the final list keeping correct order for next and prev
  for (auto & sym : symbols) {
  if (sym.n > 0) {
  sym.prev = final_prev_index;
@@ -11321,9 +11390,6 @@ struct llm_tokenizer_wpm {
  output.push_back(vocab.special_unk_id);
  }
  }
-
- // append eos token
- output.push_back(vocab.special_eos_id);
  }

  std::vector<std::string> preprocess(const std::string & text) {
@@ -11528,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
  }
  }

- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
  std::vector<llama_vocab::id> output;
-
- // OG tokenizer behavior:
- //
- // tokenizer.encode('', add_bos=True) returns [1]
- // tokenizer.encode('', add_bos=False) returns []
-
- if (bos && vocab.special_bos_id != -1) {
- output.push_back(vocab.special_bos_id);
- }
-
- if (raw_text.empty()) {
- return output;
- }
-
  std::forward_list<fragment_buffer_variant> fragment_buffer;
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());

- if (special) tokenizer_st_partition(vocab, fragment_buffer);
+ if (!raw_text.empty()) {
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+ if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+ }

  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
+ // tokenizer.encode('', add_special_tokens=False) returns []
+
+ if (add_special && vocab.special_add_bos != 0) {
+ GGML_ASSERT(vocab.special_bos_id != -1);
+ output.push_back(vocab.special_bos_id);
+ }
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -11577,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(fragment.token);
  }
  }
+
+ if (add_special && vocab.special_add_eos == 1) {
+ GGML_ASSERT(vocab.special_eos_id != -1);
+ output.push_back(vocab.special_eos_id);
+ }
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
+ if (add_special && vocab.special_add_bos == 1) {
+ GGML_ASSERT(vocab.special_bos_id != -1);
+ output.push_back(vocab.special_bos_id);
+ }
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11593,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(fragment.token);
  }
  }
+
+ GGML_ASSERT(vocab.special_add_eos != 1);
  } break;
  case LLAMA_VOCAB_TYPE_WPM:
  {
+ if (add_special) {
+ GGML_ASSERT(vocab.special_cls_id != -1);
+ output.push_back(vocab.special_cls_id);
+ }
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11609,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(fragment.token);
  }
  }
+
+ if (add_special) {
+ GGML_ASSERT(vocab.special_sep_id != -1);
+ output.push_back(vocab.special_sep_id);
+ }
  } break;
  case LLAMA_VOCAB_TYPE_NONE:
  GGML_ASSERT(false);
@@ -11775,7 +11861,9 @@ static void llama_grammar_advance_stack(
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

  if (stack.empty()) {
- new_stacks.emplace_back(stack);
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ new_stacks.emplace_back(stack);
+ }
  return;
  }

@@ -11812,7 +11900,10 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
- new_stacks.emplace_back(stack);
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ // only add the stack if it's not a duplicate of one we already have
+ new_stacks.emplace_back(stack);
+ }
  break;
  default:
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11826,12 +11917,13 @@ static void llama_grammar_advance_stack(
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
  // produces the N possible stacks if the given char is accepted at those
  // positions
- std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+ void llama_grammar_accept(
  const std::vector<std::vector<llama_grammar_element>> & rules,
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
- const uint32_t chr) {
+ const uint32_t chr,
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

- std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+ new_stacks.clear();

  for (const auto & stack : stacks) {
  if (stack.empty()) {
@@ -11850,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
  llama_grammar_advance_stack(rules, new_stack, new_stacks);
  }
  }
-
- return new_stacks;
  }

  static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11865,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
  const std::vector<llama_grammar_candidate> & candidates) {

  std::vector<llama_grammar_candidate> rejects;
+ rejects.reserve(candidates.size());

  if (stack.empty()) {
  for (const auto & tok : candidates) {
@@ -11878,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
  const llama_grammar_element * stack_pos = stack.back();

  std::vector<llama_grammar_candidate> next_candidates;
+ next_candidates.reserve(candidates.size());
+
  for (const auto & tok : candidates) {
  if (*tok.code_points == 0) {
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12685,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
  const auto & code_points = decoded.first;
+ std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+ grammar->stacks = tmp_new_stacks;
  }
  grammar->partial_utf8 = decoded.second;
  GGML_ASSERT(!grammar->stacks.empty());
@@ -13318,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
  return new_type;
  }

- static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
  std::mutex mutex;
- int counter = 0;
+ int64_t counter = 0;
  size_t new_size = 0;
  if (nthread < 2) {
  // single-thread
@@ -13328,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
  }
  auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
  nrows, n_per_row, imatrix]() {
- const int nrows_per_chunk = chunk_size / n_per_row;
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
  size_t local_size = 0;
  while (true) {
  std::unique_lock<std::mutex> lock(mutex);
- int first_row = counter; counter += nrows_per_chunk;
+ int64_t first_row = counter; counter += nrows_per_chunk;
  if (first_row >= nrows) {
  if (local_size > 0) {
  new_size += local_size;
@@ -13340,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
  break;
  }
  lock.unlock();
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
  }
  };
@@ -13463,7 +13558,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  const std::string name = ggml_get_name(meta);

  // TODO: avoid hardcoded tensor names - use the TN_* constants
- if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
+ if (name.find("attn_v.weight") != std::string::npos ||
+ name.find("attn_qkv.weight") != std::string::npos) {
  ++qs.n_attention_wv;
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
  qs.has_output = true;
@@ -13473,7 +13569,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

  // sanity checks
- GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
+ //
+ // - qs.n_attention_wv == 0 for Mamba models
+ // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+ //
+ GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");

  size_t total_size_org = 0;
  size_t total_size_new = 0;
@@ -13529,6 +13629,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  // quantize only 2D and 3D tensors (experts)
  quantize &= (ggml_n_dims(tensor) >= 2);
+
+ // do not quantize norm tensors
+ quantize &= name.find("_norm.weight") == std::string::npos;
+
  quantize &= params->quantize_output_tensor || name != "output.weight";
  quantize &= !params->only_copy;

@@ -13557,10 +13661,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (!params->pure && ggml_is_quantized(default_type)) {
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
  }
- else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+ if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
  new_type = params->token_embedding_type;
  }
- else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+ if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
  new_type = params->output_tensor_type;
  }

@@ -13575,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_size = ggml_nbytes(tensor);
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
  } else {
- const size_t nelements = ggml_nelements(tensor);
+ const int64_t nelements = ggml_nelements(tensor);

  const float * imatrix = nullptr;
  if (imatrix_data) {
@@ -13627,20 +13731,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
  fflush(stdout);

- if (work.size() < nelements * 4) {
+ if (work.size() < (size_t)nelements * 4) {
  work.resize(nelements * 4); // upper bound on size
  }
  new_data = work.data();

- const int n_per_row = tensor->ne[0];
- const int nrows = tensor->ne[1];
+ const int64_t n_per_row = tensor->ne[0];
+ const int64_t nrows = tensor->ne[1];

- static const int min_chunk_size = 32 * 512;
- const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+ static const int64_t min_chunk_size = 32 * 512;
+ const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);

- const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
- const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
- const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

  // quantize each expert separately since they have different importance matrices
  new_size = 0;
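
The int → int64_t widening in this and the preceding quantization hunks matters once a single tensor holds more than INT32_MAX (about 2.1e9) elements, which large vocabularies and fused MoE expert tensors can reach. An illustrative calculation with hypothetical dimensions (not taken from the diff):

    #include <cstdint>

    // hypothetical large token-embedding matrix: 256000 rows x 12288 columns
    const int64_t n_per_row = 12288;
    const int64_t nrows     = 256000;
    const int64_t nelements = n_per_row * nrows;   // 3,145,728,000 > INT32_MAX (2,147,483,647)
    // with 32-bit ints both this product and the `nelements * 4` work-buffer bound above
    // would overflow; int64_t keeps the chunking arithmetic and the shared row counter
    // in llama_tensor_quantize_internal well-defined
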
@@ -14905,9 +15009,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
  llama_kv_cache_update_internal(*ctx);
  }

+ // deprecated
+ size_t llama_get_state_size(const struct llama_context * ctx) {
+ return llama_state_get_size(ctx);
+ }
+
+ // deprecated
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ return llama_state_get_data(ctx, dst);
+ }
+
+ // deprecated
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ return llama_state_set_data(ctx, src);
+ }
+
+ // deprecated
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ }
+
+ // deprecated
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+ }

  // Returns the *maximum* size of the state
- size_t llama_get_state_size(const struct llama_context * ctx) {
+ size_t llama_state_get_size(const struct llama_context * ctx) {
  const auto & cparams = ctx->cparams;
  const auto & hparams = ctx->model.hparams;
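
The hunk above keeps the old whole-context state entry points as thin deprecated wrappers and introduces the llama_state_* names. A minimal sketch of the renamed API (error handling omitted; assumes `ctx` is a valid llama_context):

    const size_t n_state = llama_state_get_size(ctx);                 // upper bound on the serialized size
    std::vector<uint8_t> buf(n_state);
    const size_t n_written = llama_state_get_data(ctx, buf.data());   // snapshot RNG, outputs, KV cache, ...
    // ... later, restore into the same (or an identically configured) context:
    const size_t n_read = llama_state_set_data(ctx, buf.data());
    // existing callers of llama_get_state_size / llama_copy_state_data / llama_set_state_data
    // keep working through the deprecated wrappers above
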

@@ -14995,15 +15123,15 @@ struct llama_data_file_context : llama_data_context {
  * file context:
  * llama_file file("/path", "wb");
  * llama_data_file_context data_ctx(&file);
- * llama_copy_state_data(ctx, &data_ctx);
+ * llama_state_get_data(ctx, &data_ctx);
  *
  * buffer context:
  * std::vector<uint8_t> buf(max_size, 0);
  * llama_data_buffer_context data_ctx(&buf.data());
- * llama_copy_state_data(ctx, &data_ctx);
+ * llama_state_get_data(ctx, &data_ctx);
  *
  */
- static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
  // copy rng
  {
  std::ostringstream rng_ss;
@@ -15147,15 +15275,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  }
  }

- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
  llama_data_buffer_context data_ctx(dst);
- llama_copy_state_data_internal(ctx, &data_ctx);
+ llama_state_get_data_internal(ctx, &data_ctx);

  return data_ctx.get_size_written();
  }

  // Sets the state reading from the specified source address
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
  const uint8_t * inp = src;

  // set rng
@@ -15307,14 +15435,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  }

  const size_t nread = inp - src;
- const size_t max_size = llama_get_state_size(ctx);
+ const size_t max_size = llama_state_get_size(ctx);

  GGML_ASSERT(nread <= max_size);

  return nread;
  }

- static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  llama_file file(path_session, "rb");

  // sanity checks
@@ -15352,7 +15480,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  // restore the context state
  {
  const size_t n_state_size_cur = file.size - file.tell();
- const size_t n_state_size_max = llama_get_state_size(ctx);
+ const size_t n_state_size_max = llama_state_get_size(ctx);

  if (n_state_size_cur > n_state_size_max) {
  LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -15362,22 +15490,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  std::vector<uint8_t> state_data(n_state_size_max);
  file.read_raw(state_data.data(), n_state_size_cur);

- llama_set_state_data(ctx, state_data.data());
+ llama_state_set_data(ctx, state_data.data());
  }

  return true;
  }

- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  try {
- return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
  return false;
  }
  }

- bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  llama_file file(path_session, "wb");

  file.write_u32(LLAMA_SESSION_MAGIC);
@@ -15391,11 +15519,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

  // save the context state using stream saving
  llama_data_file_context data_ctx(&file);
- llama_copy_state_data_internal(ctx, &data_ctx);
+ llama_state_get_data_internal(ctx, &data_ctx);

  return true;
  }

+ bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ try {
+ return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
+ return false;
+ }
+ }
+
+ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
+ // save the size of size_t as a uint32_t for safety check
+ const size_t size_t_size_size = sizeof(uint32_t);
+
+ // other values
+ const size_t s_cell_count_size = sizeof(uint32_t);
+ const size_t s_layer_count_size = sizeof(uint32_t);
+ const size_t n_embd_v_gqa_size = sizeof(uint32_t);
+
+ size_t s_cell_count = 0;
+ size_t s_cell_data_size = 0;
+ const auto & kv_self = ctx->kv_self;
+ const auto & hparams = ctx->model.hparams;
+
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
+ const auto & cell = kv_self.cells[i];
+ if (cell.seq_id.count(seq_id) > 0) {
+ ++s_cell_count;
+ s_cell_data_size += sizeof(llama_pos);
+ }
+ }
+
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // types of keys and values
+ s_cell_data_size += sizeof(int32_t) * 2;
+ // k_size_row and v_size_el values of layer
+ s_cell_data_size += sizeof(size_t) * 2;
+
+ // keys
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ s_cell_data_size += k_size_row * s_cell_count;
+
+ // values (transposed)
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
+ }
+
+ const size_t s_total = (
+ size_t_size_size +
+ s_cell_count_size +
+ s_layer_count_size +
+ n_embd_v_gqa_size +
+ s_cell_data_size
+ );
+
+ return s_total;
+ }
+
+ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+ const auto & kv_self = ctx->kv_self;
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+ // Save the size of size_t as a uint32_t for safety check
+ const uint32_t size_t_size = sizeof(size_t);
+ data_ctx.write(&size_t_size, sizeof(size_t_size));
+
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+ uint32_t cell_count = 0;
+
+ // Count the number of cells with the specified seq_id
+ // Find all the ranges of cells with this seq id
+ {
+ uint32_t cell_range_begin = kv_self.size;
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
+ const auto & cell = kv_self.cells[i];
+ if (cell.has_seq_id(seq_id)) {
+ ++cell_count;
+ if (cell_range_begin == kv_self.size) {
+ cell_range_begin = i;
+ }
+ }
+ else {
+ if (cell_range_begin != kv_self.size) {
+ cell_ranges.push_back({ cell_range_begin, i });
+ cell_range_begin = kv_self.size;
+ }
+ }
+ }
+ if (cell_range_begin != kv_self.size) {
+ cell_ranges.push_back({ cell_range_begin, kv_self.size });
+ }
+
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+ uint32_t cell_count_check = 0;
+ for (const auto & range : cell_ranges) {
+ cell_count_check += range.second - range.first;
+ }
+ GGML_ASSERT(cell_count == cell_count_check);
+ }
+
+ // Write the cell count
+ data_ctx.write(&cell_count, sizeof(cell_count));
+
+ const auto & hparams = ctx->model.hparams;
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+ // Write the layer count
+ data_ctx.write(&n_layer, sizeof(n_layer));
+
+ // Write n_embd_v_gqa
+ data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+ // Iterate the ranges and write all the pos (this is the token position in the prompt)
+ for (const auto & range : cell_ranges) {
+ for (uint32_t i = range.first; i < range.second; ++i) {
+ const auto & cell = kv_self.cells[i];
+ data_ctx.write(&cell.pos, sizeof(cell.pos));
+ }
+ }
+
+ // Iterate and write all the keys first, each row is a cell
+ // Get whole range at a time
+ std::vector<uint8_t> tmp_buf;
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Write key type
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+ data_ctx.write(&k_type_i, sizeof(k_type_i));
+
+ // Write row size of key
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ data_ctx.write(&k_size_row, sizeof(k_size_row));
+
+ // Read each range of cells of k_size length each into tmp_buf and write out
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ tmp_buf.resize(range_size * k_size_row);
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
+ }
+ }
+
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
+ const uint32_t kv_size = kv_self.size;
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Write value type
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
+
+ // Write element size
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
+
+ // For each row, we get the element values of each cell
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+ tmp_buf.resize(range_size * v_size_el);
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
+ }
+ }
+ }
+
+ return data_ctx.get_size_written();
+ }
+
+ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
+ llama_data_buffer_context data_ctx(dst);
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+ }
+
+ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+ auto & kv_self = ctx->kv_self;
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+ // Wipe the slot
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+
+ const uint8_t * inp = src;
+
+ // Read size of size_t
+ uint32_t size_t_size;
+ memcpy(&size_t_size, inp, sizeof(size_t_size));
+ inp += sizeof(size_t_size);
+ if (size_t_size != sizeof(size_t)) {
+ LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
+ return 0;
+ }
+
+ // Read the cell count
+ uint32_t cell_count;
+ memcpy(&cell_count, inp, sizeof(cell_count));
+ inp += sizeof(cell_count);
+
+ // Read the layer count
+ uint32_t n_layer_ref;
+ memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
+ inp += sizeof(n_layer_ref);
+
+ // Read n_embd_v_gqa
+ uint32_t n_embd_v_gqa_ref;
+ memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
+ inp += sizeof(n_embd_v_gqa_ref);
+
+ // Sanity check model compatibility
+ const auto & hparams = ctx->model.hparams;
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+ if (n_layer != n_layer_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
+ return 0;
+ }
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
+ return 0;
+ }
+
+ // Allocate the new cells for the slot
+ if (cell_count) {
+ llama_batch batch = llama_batch_init(cell_count, 0, 1);
+ batch.n_tokens = cell_count;
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ llama_pos pos;
+ memcpy(&pos, inp, sizeof(pos));
+ inp += sizeof(pos);
+
+ batch.pos[i] = pos;
+ batch.n_seq_id[i] = 1;
+ batch.seq_id[i][0] = dest_seq_id;
+ }
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
+ llama_batch_free(batch);
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+ return 0;
+ }
+
+ // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+ // Assume that this is one contiguous block of cells
+ GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
+ GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+ GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
+
+ // Cleanup
+ llama_batch_free(batch);
+ }
+
+ const uint32_t kv_size = kv_self.size;
+ const uint32_t kv_head = kv_self.head;
+
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Read type of key
+ int32_t k_type_i_ref;
+ memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
+ inp += sizeof(k_type_i_ref);
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+ if (k_type_i != k_type_i_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+ return 0;
+ }
+
+ // Read row size of key
+ size_t k_size_row_ref;
+ memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
+ inp += sizeof(k_size_row_ref);
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ if (k_size_row != k_size_row_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
+ return 0;
+ }
+
+ if (cell_count) {
+ // Read and set the keys for the whole cell range
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
+ inp += cell_count * k_size_row;
+ }
+ }
+
+ // For each layer, read the values for each cell (transposed)
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Read type of value
+ int32_t v_type_i_ref;
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+ inp += sizeof(v_type_i_ref);
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ if (v_type_i != v_type_i_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return 0;
+ }
+
+ // Read element size of value
+ size_t v_size_el_ref;
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+ inp += sizeof(v_size_el_ref);
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ if (v_size_el != v_size_el_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+ return 0;
+ }
+
+ if (cell_count) {
+ // For each row in the transposed matrix, read the values for the whole cell range
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+ inp += cell_count * v_size_el;
+ }
+ }
+ }
+
+ const size_t nread = inp - src;
+ return nread;
+ }
+
+ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(filepath, "wb");
+
+ file.write_u32(LLAMA_STATE_SEQ_MAGIC);
+ file.write_u32(LLAMA_STATE_SEQ_VERSION);
+
+ // save the prompt
+ file.write_u32((uint32_t)n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state using stream saving
+ llama_data_file_context data_ctx(&file);
+ llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+
+ const size_t res = file.tell();
+ GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
+ return res;
+ }
+
+ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(filepath, "rb");
+
+ // version checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
+ return 0;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return 0;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t state_size = file.size - file.tell();
+ std::vector<uint8_t> state_data(state_size);
+ file.read_raw(state_data.data(), state_size);
+ const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
+ if (!nread) {
+ LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
+ return 0;
+ }
+ GGML_ASSERT(nread <= state_size);
+ GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
+ }
+
+ return file.tell();
+ }
+
+ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+ try {
+ return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
+ return 0;
+ }
+ }
+
+ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ try {
+ return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
+ return 0;
+ }
+ }
+
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
  ctx->cparams.n_threads = n_threads;
  ctx->cparams.n_threads_batch = n_threads_batch;
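
The large hunk above adds a per-sequence state API: llama_state_seq_get_size/get_data/set_data serialize only the KV cells belonging to one seq_id (recurrent caches are explicitly not supported), and llama_state_seq_save_file/load_file wrap that in a small file format guarded by LLAMA_STATE_SEQ_MAGIC and LLAMA_STATE_SEQ_VERSION. A hedged usage sketch (assumes sequence 0 was populated by an earlier decode and that `prompt` holds its tokens):

    // save sequence 0 together with its prompt tokens
    llama_state_seq_save_file(ctx, "seq0.bin", /*seq_id=*/0, prompt.data(), prompt.size());

    // later, restore it into sequence 1 of another context built from the same model
    std::vector<llama_token> tokens(4096);
    size_t n_tokens = 0;
    const size_t n_read = llama_state_seq_load_file(ctx2, "seq0.bin", /*dest_seq_id=*/1,
                                                    tokens.data(), tokens.size(), &n_tokens);
    if (n_read == 0) {
        // bad magic/version, capacity exceeded, or an incompatible model (see the checks above)
    }
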
@@ -15509,23 +16046,31 @@ float * llama_get_logits(struct llama_context * ctx) {
  }

  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+ int32_t j = -1;
  llama_synchronize(ctx);

  try {
  if (ctx->logits == nullptr) {
  throw std::runtime_error("no logits");
  }
- if ((size_t) i >= ctx->output_ids.size()) {
+
+ if (i < 0) {
+ j = ctx->n_outputs + i;
+ if (j < 0) {
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+ }
+ } else if ((size_t) i >= ctx->output_ids.size()) {
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+ } else {
+ j = ctx->output_ids[i];
  }
- const int32_t j = ctx->output_ids[i];

  if (j < 0) {
  throw std::runtime_error(format("batch.logits[%d] != true", i));
  }
- if ((size_t) j >= ctx->output_size) {
+ if (j >= ctx->n_outputs) {
  // This should not happen
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  }

  return ctx->logits + j*ctx->model.hparams.n_vocab;
@@ -15545,23 +16090,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  }

  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+ int32_t j = -1;
+
  llama_synchronize(ctx);

  try {
  if (ctx->embd == nullptr) {
  throw std::runtime_error("no embeddings");
  }
- if ((size_t) i >= ctx->output_ids.size()) {
+
+ if (i < 0) {
+ j = ctx->n_outputs + i;
+ if (j < 0) {
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+ }
+ } else if ((size_t) i >= ctx->output_ids.size()) {
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+ } else {
+ j = ctx->output_ids[i];
  }
- const int32_t j = ctx->output_ids[i];

  if (j < 0) {
  throw std::runtime_error(format("batch.logits[%d] != true", i));
  }
- if ((size_t) j >= ctx->output_size) {
+ if (j >= ctx->n_outputs) {
  // This should not happen
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  }

  return ctx->embd + j*ctx->model.hparams.n_embd;
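
With the two hunks above, llama_get_logits_ith and llama_get_embeddings_ith accept negative indices counted from the end of the outputs of the most recent batch (llama_decode_internal now records that total in lctx.n_outputs, see the earlier hunk). A minimal sketch, assuming the last token of the batch had logits requested:

    llama_decode(ctx, batch);
    const float * logits = llama_get_logits_ith(ctx, -1);   // last output row
    const int n_vocab = llama_n_vocab(model);
    int best = 0;                                            // trivial greedy pick
    for (int t = 1; t < n_vocab; ++t) {
        if (logits[t] > logits[best]) {
            best = t;
        }
    }
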
@@ -15608,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
  return model->vocab.special_eos_id;
  }

+ llama_token llama_token_cls(const struct llama_model * model) {
+ return model->vocab.special_cls_id;
+ }
+
+ llama_token llama_token_sep(const struct llama_model * model) {
+ return model->vocab.special_sep_id;
+ }
+
  llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }
@@ -15642,9 +16204,9 @@ int32_t llama_tokenize(
  int32_t text_len,
  llama_token * tokens,
  int32_t n_tokens_max,
- bool add_bos,
- bool special) {
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
+ bool add_special,
+ bool parse_special) {
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);

  if (n_tokens_max < (int) res.size()) {
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
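
Taken together with the tokenizer hunks earlier in the diff, the rename above separates two concerns: add_special controls whether the vocabulary's own leading/trailing special tokens are added (BOS/EOS for SPM and BPE, CLS ... SEP for WPM/BERT vocabularies, which no longer get an EOS appended), while parse_special controls whether special tokens spelled out in the input text are parsed. A hedged call sketch using the new parameter names:

    const char * text = "hello world";
    std::vector<llama_token> toks(64);
    const int32_t n = llama_tokenize(model, text, (int32_t)strlen(text),
                                     toks.data(), (int32_t)toks.size(),
                                     /*add_special=*/true,
                                     /*parse_special=*/false);
    // for a WPM (BERT-style) vocab this now yields [CLS] ... [SEP];
    // a non-positive return value here would indicate the destination buffer was too small
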