llama_cpp 0.4.0 → 0.5.0

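Diff of the bundled llama.cpp source between gem versions 0.4.0 and 0.5.0:
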
@@ -1,9 +1,6 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
- #include <cstddef>
- #include <cstdint>
- #include <cstdio>
  #endif

  #include "llama.h"
@@ -62,6 +59,9 @@
  #include <cinttypes>
  #include <climits>
  #include <cstdarg>
+ #include <cstddef>
+ #include <cstdint>
+ #include <cstdio>
  #include <cstring>
  #include <ctime>
  #include <fstream>
@@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
  }

  void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- for (size_t pos = 0; ; pos += replace.length()) {
- pos = s.find(search, pos);
- if (pos == std::string::npos) break;
- s.erase(pos, search.length());
- s.insert(pos, replace);
+ std::string result;
+ for (size_t pos = 0; ; pos += search.length()) {
+ auto new_pos = s.find(search, pos);
+ if (new_pos == std::string::npos) {
+ result += s.substr(pos, s.size() - pos);
+ break;
+ }
+ result += s.substr(pos, new_pos - pos) + replace;
+ pos = new_pos;
  }
+ s = std::move(result);
  }

  static void zeros(std::ofstream & file, size_t n) {
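
Note: replace_all now builds the output in a scratch string instead of calling erase/insert on s in place, so the tail of the string is not shifted on every replacement. A minimal standalone sketch of the same pattern (replace_all_sketch and the example strings are illustrative, not part of the library):

#include <iostream>
#include <string>

// Same approach as the new replace_all above: copy the untouched span,
// append `replace`, and continue scanning just past the match.
static void replace_all_sketch(std::string & s, const std::string & search, const std::string & replace) {
    std::string result;
    for (size_t pos = 0; ; pos += search.length()) {
        auto new_pos = s.find(search, pos);
        if (new_pos == std::string::npos) {
            result += s.substr(pos);
            break;
        }
        result += s.substr(pos, new_pos - pos) + replace;
        pos = new_pos;
    }
    s = std::move(result);
}

int main() {
    std::string text = "a b c";
    replace_all_sketch(text, " ", "_");
    std::cout << text << "\n"; // prints: a_b_c
}
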
@@ -796,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

- static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_str(ctx, token, result.data(), result.size());
+ int check = llama_token_to_piece(ctx, token, result.data(), result.size());
  GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
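
Note: the static helper above now wraps llama_token_to_piece, the renamed public API (formerly llama_token_to_str; the rename itself appears in a later hunk). A negative return value reports the required buffer size, which gives callers the same resize-and-retry pattern used above; a hedged caller-side sketch:

std::vector<char> buf(8, 0);
int n = llama_token_to_piece(ctx, token, buf.data(), (int) buf.size());
if (n < 0) {                      // buffer too small: -n bytes are required
    buf.resize(-n);
    n = llama_token_to_piece(ctx, token, buf.data(), (int) buf.size());
}
std::string piece(buf.data(), n); // the API does not write a null terminator
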
@@ -955,10 +960,10 @@ struct llama_vocab {
  id linefeed_id = 13;

  int find_bpe_rank(std::string token_left, std::string token_right) const {
- replace_all(token_left, " ", "Ġ");
- replace_all(token_left, "\n", "Ċ");
- replace_all(token_right, " ", "Ġ");
- replace_all(token_right, "\n", "Ċ");
+ replace_all(token_left, " ", "\u0120");
+ replace_all(token_left, "\n", "\u010A");
+ replace_all(token_right, " ", "\u0120");
+ replace_all(token_right, "\n", "\u010A");

  auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  if (it == bpe_ranks.end()) {
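
Note: "\u0120" and "\u010A" are the escape spellings of Ġ (U+0120) and Ċ (U+010A), the markers GPT-2-style byte-level BPE vocabularies use for a leading space and a newline; the change only swaps the literal characters for universal-character-names so the source stays ASCII. Assuming a UTF-8 execution character set (an assumption, not stated in the diff), both spellings encode to the same bytes:

static_assert(sizeof("\u0120") == sizeof("\xC4\xA0"), "U+0120 (Ġ) encodes to 2 UTF-8 bytes");
static_assert(sizeof("\u010A") == sizeof("\xC4\x8A"), "U+010A (Ċ) encodes to 2 UTF-8 bytes");
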
@@ -1144,11 +1149,13 @@ static bool llama_kv_cache_init(

  enum llama_fver {
  GGUF_FILE_VERSION_V1 = 1,
+ GGUF_FILE_VERSION_V2 = 2,
  };

  static const char * llama_file_version_name(llama_fver version) {
  switch (version) {
- case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
+ case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+ case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
  }

  return "unknown";
@@ -1635,7 +1642,8 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
  llama_model_loader & ml,
@@ -1737,7 +1745,11 @@ static void llm_load_vocab(
  }

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+ if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } else {
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+ }

  // special tokens
  GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
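
Note: for SentencePiece (SPM) vocabularies the newline token is now looked up directly as a byte token instead of tokenizing "\n". Based on the "<0x0A>" comment above and the llama_byte_to_token context in a later hunk, the lookup amounts to something like the following sketch (the exact buffer formatting is an assumption, not quoted from the diff):

char buf[16];
snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned char) '\n'); // "<0x0A>"
vocab.linefeed_id = vocab.token_to_id.at(buf);                // byte tokens are stored as "<0xXX>"
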
@@ -2635,18 +2647,20 @@ static struct ggml_cgraph * llm_build_falcon(

  const size_t wsize = ggml_type_size(cur->type);

- struct ggml_tensor * tmpq = ggml_view_3d(
+ // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+ // non-contiguous views is added for the rope operator
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- 0);
+ 0));
  offload_func_kq(tmpq);

- struct ggml_tensor * tmpk = ggml_view_3d(
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head_kv, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * n_head);
+ wsize * n_embd_head * n_head));
  offload_func_kq(tmpk);

  struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2845,6 @@ static bool llama_eval_internal(

  GGML_ASSERT(n_tokens > 0);
  GGML_ASSERT(n_past >= 0);
- GGML_ASSERT(n_threads > 0);
  // TODO: keep the values of n_batch and n_ctx
  // GGML_ASSERT(n_tokens <= n_batch);
  // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2855,8 @@ static bool llama_eval_internal(
  ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
  #endif

+ GGML_ASSERT(n_threads > 0);
+
  const int N = n_tokens;

  const auto & model = lctx.model;
@@ -3026,16 +3041,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
  return vocab.token_to_id.at(buf);
  }

- static std::string llama_escape_whitespace(const std::string& text) {
- std::string result = "\xe2\x96\x81";
- for (size_t offs = 0; offs < text.length(); ++offs) {
- if (text[offs] == ' ') {
- result += "\xe2\x96\x81";
- } else {
- result += text[offs];
- }
- }
- return result;
+ static void llama_escape_whitespace(std::string & text) {
+ replace_all(text, " ", "\xe2\x96\x81");
  }

  static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3211,7 @@ private:

  struct llm_bigram_bpe {
  struct comparator {
- bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+ bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
  return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
  }
  };
@@ -3219,7 +3226,7 @@ struct llm_bigram_bpe {
  };

  struct llm_tokenizer_bpe {
- llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  int final_prev_index = -1;
@@ -3352,26 +3359,23 @@ private:
  }

  // probably not 100% correct
- // TODO: this is quite slow - how to make it more efficient?
- static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
  std::vector<std::string> words;

  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
  const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
  const std::regex re(pattern);
- std::smatch m;

- while (std::regex_search(text, m, re)) {
- for (auto x : m) {
- words.push_back(x);
- }
- text = m.suffix();
+ auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+ auto words_end = std::sregex_iterator();
+ auto n_words = std::distance(words_begin, words_end);
+ words.reserve(n_words);
+ for (auto it = words_begin; it != words_end; ++it) {
+ words.push_back(it->str());
  }
-
  return words;
- }

- bool flag_g2ws = false;
+ }

  const llama_vocab & vocab;

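
Note: bpe_gpt2_preprocess now iterates matches with std::sregex_iterator instead of re-running std::regex_search on the shrinking suffix, so the remainder of the text is no longer copied after every match. A self-contained sketch of the same iteration pattern, using a simplified pattern rather than the full GPT-2 one quoted above:

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    const std::string text = "Hello world 42!";
    const std::regex re(R"( ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+)");

    std::vector<std::string> words;
    auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
    auto words_end   = std::sregex_iterator();
    words.reserve(std::distance(words_begin, words_end));
    for (auto it = words_begin; it != words_end; ++it) {
        words.push_back(it->str()); // whole match, equivalent to m[0] in the old loop
    }

    for (const auto & w : words) {
        std::cout << "[" << w << "]";
    }
    std::cout << "\n"; // prints: [Hello][ world][ 42][!]
}
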
@@ -3381,9 +3385,18 @@ private:
  llm_bigram_bpe::queue work_queue;
  };

- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
  std::vector<llama_vocab::id> output;

+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_bos=True) returns [1]
+ // tokenizer.encode('', add_bos=False) returns []
+
+ if (bos && vocab.special_bos_id != -1) {
+ output.push_back(vocab.special_bos_id);
+ }
+
  if (raw_text.empty()) {
  return output;
  }
@@ -3391,29 +3404,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
- llm_tokenizer_spm tokenizer(vocab);
+ // without adding this leading whitespace, we do not get the same results as the original tokenizer
+ raw_text = " " + raw_text;

- if (bos) {
- output.push_back(vocab.special_bos_id);
- }
-
- std::string text;
- if (escape) {
- text = llama_escape_whitespace(raw_text);
- } else {
- text = raw_text;
- }
-
- tokenizer.tokenize(text, output);
+ llm_tokenizer_spm tokenizer(vocab);
+ llama_escape_whitespace(raw_text);
+ tokenizer.tokenize(raw_text, output);
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- llm_tokenizer_bpe tokenizer(vocab, escape);
-
- if (bos && vocab.special_bos_id != -1) {
- output.push_back(vocab.special_bos_id);
- }
-
+ llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } break;
  };
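
Note: BOS insertion now happens once at the top of llama_tokenize_internal for both vocabulary types, and SPM input additionally gets a leading space plus whitespace escaping, matching the original SentencePiece tokenizer. A hedged sketch of the caller-side behavior through the public llama_tokenize wrapper, which forwards here; the negative-return convention for a too-small buffer is an assumption based on llama_tokenize_with_model further down:

std::vector<llama_token> tokens(64);
int n = llama_tokenize(ctx, "Hello world", tokens.data(), (int) tokens.size(), /*add_bos=*/true);
if (n < 0) {            // assumed: -n is the number of tokens that would be needed
    tokens.resize(-n);
    n = llama_tokenize(ctx, "Hello world", tokens.data(), (int) tokens.size(), true);
}
tokens.resize(n);
// tokens[0] == special_bos_id; an empty prompt with add_bos=true yields just [BOS]
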
@@ -3908,7 +3908,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *

  // Calculate absolute value of second derivatives
  for (size_t i = 0; i < second_derivatives.size(); ++i) {
- second_derivatives[i] = abs(second_derivatives[i]);
+ second_derivatives[i] = std::abs(second_derivatives[i]);
  }

  // Normalize the second derivatives
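
Note: the switch to std::abs matters because the C library abs() takes an int; if only that overload is visible, calling it on the float second derivatives silently truncates toward zero. std::abs provides float and double overloads:

float x = -0.75f;
// int   i = abs(x);    // may resolve to abs(int) and yield 0, depending on which overloads are visible
float f = std::abs(x);  // float overload: 0.75f
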
@@ -4099,16 +4099,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  std::vector<llama_grammar_candidate> candidates_grammar;

  for (size_t i = 0; i < candidates->size; ++i) {
- const llama_token id = candidates->data[i].id;
- const std::string text = llama_token_to_text(ctx, id);
+ const llama_token id = candidates->data[i].id;
+ const std::string piece = llama_token_to_str(ctx, id);
  if (id == eos) {
  if (!allow_eos) {
  candidates->data[i].logit = -INFINITY;
  }
- } else if (text.empty() || text[0] == 0) {
+ } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -4312,10 +4312,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }

- const std::string text = llama_token_to_text(ctx, token);
+ const std::string piece = llama_token_to_str(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4326,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }

+ //
+ // Beam search
+ //
+
+ struct llama_beam {
+ std::vector<llama_token> tokens;
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+ // Sort beams by probability. In case of ties, prefer beams at eob.
+ bool operator<(const llama_beam & rhs) const {
+ return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+ }
+ // Shift off first n tokens and discard them.
+ void shift_tokens(const size_t n) {
+ if (n) {
+ std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+ tokens.resize(tokens.size() - n);
+ }
+ }
+ llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+ };
+
+ // A struct for calculating logit-related info.
+ struct llama_logit_info {
+ const float * const logits;
+ const int n_vocab;
+ const float max_l;
+ const float normalizer;
+ struct sum_exp {
+ float max_l;
+ float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+ };
+ llama_logit_info(llama_context * ctx)
+ : logits(llama_get_logits(ctx))
+ , n_vocab(llama_n_vocab(ctx))
+ , max_l(*std::max_element(logits, logits + n_vocab))
+ , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+ { }
+ llama_token_data get_token_data(const llama_token token_id) const {
+ constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+ return {token_id, logits[token_id], p};
+ }
+ // Return top k token_data by logit.
+ std::vector<llama_token_data> top_k(size_t k) {
+ std::vector<llama_token_data> min_heap; // min-heap by logit
+ const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+ min_heap.reserve(k_min);
+ for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+ min_heap.push_back(get_token_data(token_id));
+ }
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+ std::make_heap(min_heap.begin(), min_heap.end(), comp);
+ for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+ if (min_heap.front().logit < logits[token_id]) {
+ std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+ min_heap.back().id = token_id;
+ min_heap.back().logit = logits[token_id];
+ std::push_heap(min_heap.begin(), min_heap.end(), comp);
+ }
+ }
+ return min_heap;
+ }
+ float probability_from_logit(float logit) {
+ return normalizer * std::exp(logit - max_l);
+ }
+ };
+
+ struct llama_beam_search_data {
+ llama_context * ctx;
+ size_t n_beams;
+ int n_past;
+ int n_predict;
+ int n_threads;
+ std::vector<llama_beam> beams;
+ std::vector<llama_beam> next_beams;
+
+ // Re-calculated on each loop iteration
+ size_t common_prefix_length;
+
+ // Used to communicate to/from callback on beams state.
+ std::vector<llama_beam_view> beam_views;
+
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+ : ctx(ctx)
+ , n_beams(n_beams)
+ , n_past(n_past)
+ , n_predict(n_predict)
+ , n_threads(n_threads)
+ , beam_views(n_beams) {
+ beams.reserve(n_beams);
+ next_beams.reserve(n_beams);
+ }
+
+ // Collapse beams to a single beam given by index.
+ void collapse_beams(const size_t beam_idx) {
+ if (0u < beam_idx) {
+ std::swap(beams[0], beams[beam_idx]);
+ }
+ beams.resize(1);
+ }
+
+ // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+ // The repetative patterns below reflect the 2 stages of heaps:
+ // * Gather elements until the vector is full, then call std::make_heap() on it.
+ // * If the heap is full and a new element is found that should be included, pop the
+ // least element to the back(), replace it with the new, then push it into the heap.
+ void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+ // Min-heaps use a greater-than comparator.
+ const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+ if (beam.eob) {
+ // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+ if (next_beams.size() < n_beams) {
+ next_beams.push_back(std::move(beam));
+ if (next_beams.size() == n_beams) {
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else if (next_beams.front().p < beam.p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = std::move(beam);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else {
+ // beam is not at end-of-sentence, so branch with next top_k tokens.
+ if (!beam.tokens.empty()) {
+ llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+ }
+ llama_logit_info logit_info(ctx);
+ std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+ size_t i=0;
+ if (next_beams.size() < n_beams) {
+ for (; next_beams.size() < n_beams ; ++i) {
+ llama_beam next_beam = beam;
+ next_beam.tokens.push_back(next_tokens[i].id);
+ next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ next_beams.push_back(std::move(next_beam));
+ }
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ } else {
+ for (; next_beams.front().p == 0.0f ; ++i) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ for (; i < n_beams ; ++i) {
+ const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+ if (next_beams.front().p < next_p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p = next_p;
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ }
+ }
+
+ // Find common_prefix_length based on beams.
+ // Requires beams is not empty.
+ size_t find_common_prefix_length() {
+ size_t common_prefix_length = beams[0].tokens.size();
+ for (size_t i = 1 ; i < beams.size() ; ++i) {
+ common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+ for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+ if (beams[0].tokens[j] != beams[i].tokens[j]) {
+ common_prefix_length = j;
+ break;
+ }
+ }
+ }
+ return common_prefix_length;
+ }
+
+ // Construct beams_state to send back to caller via the callback function.
+ // Side effect: set common_prefix_length = find_common_prefix_length();
+ llama_beams_state get_beams_state(const bool last_call) {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beam_views[i] = beams[i].view();
+ }
+ common_prefix_length = find_common_prefix_length();
+ return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+ }
+
+ // Loop:
+ // * while i < n_predict, AND
+ // * any of the beams have not yet reached end-of-beam (eob), AND
+ // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+ // (since all other beam probabilities can only decrease)
+ void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+ beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
+ const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+ for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+ !beams[top_beam_index()].eob ; ++i) {
+ callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
+ update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
+ if (common_prefix_length) {
+ llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+ n_past += common_prefix_length;
+ }
+ // Zero-out next_beam probabilities to place them last in following min-heap.
+ std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+ for (llama_beam & beam : beams) {
+ beam.shift_tokens(common_prefix_length);
+ fill_next_beams_by_top_probabilities(beam);
+ }
+ // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+ beams.swap(next_beams);
+ renormalize_beam_probabilities(beams);
+ }
+ collapse_beams(top_beam_index());
+ callback(callback_data, get_beams_state(true));
+ }
+
+ // As beams grow, the cumulative probabilities decrease.
+ // Renormalize them to avoid floating point underflow.
+ static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+ const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+ const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+ std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+ }
+
+ // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+ size_t top_beam_index() {
+ return std::max_element(beams.begin(), beams.end()) - beams.begin();
+ }
+
+ // Copy (p,eob) for each beam which may have been changed by the callback.
+ void update_beams_from_beam_views() {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beams[i].p = beam_views[i].p;
+ beams[i].eob = beam_views[i].eob;
+ }
+ }
+ };
+
+ void llama_beam_search(llama_context * ctx,
+ llama_beam_search_callback_fn_t callback, void * callback_data,
+ size_t n_beams, int n_past, int n_predict, int n_threads) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+ beam_search_data.loop(callback, callback_data);
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+
  //
  // quantization
  //
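
Note: llama_beam_search is a new public entry point. The caller-supplied callback owns the stopping criterion: it inspects each beam view, may set eob, and can collect the tokens shared by all beams (common_prefix_length), which are final. A hedged usage sketch; the field names of llama_beam_view and llama_beams_state are inferred from the aggregate initializers above, and the EOS check is illustrative only:

static void beam_search_callback(void * user_data, llama_beams_state state) {
    llama_context * ctx = static_cast<llama_context *>(user_data);
    // Mark beams whose last token is EOS so the loop stops extending them.
    for (size_t i = 0; i < state.n_beams; ++i) {
        llama_beam_view & bv = state.beam_views[i];
        if (!bv.eob && bv.n_tokens > 0 && bv.tokens[bv.n_tokens - 1] == llama_token_eos(ctx)) {
            bv.eob = true;
        }
    }
    // The first state.common_prefix_length tokens of any beam are shared by all
    // beams and can be printed or accumulated here.
}

// Caller side:
llama_beam_search(ctx, beam_search_callback, /*callback_data=*/ctx,
                  /*n_beams=*/4, /*n_past=*/n_past, /*n_predict=*/64, /*n_threads=*/4);
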
@@ -4423,6 +4674,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));

+ llama_model model;
+ llm_load_arch(*ml, model);
+ llm_load_hparams(*ml, model, 0, 0, 0);
+
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
  struct gguf_context * ctx_out = gguf_init_empty();

@@ -4448,6 +4703,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  ++n_feed_forward_w2;
  }
  }
+ if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+ __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+ }

  int i_attention_wv = 0;
  int i_feed_forward_w2 = 0;
@@ -4524,8 +4783,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
  int nx = tensor->ne[0];
- int ny = tensor->ne[1];
- if (nx % QK_K == 0 && ny % QK_K == 0) {
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (new_type != GGML_TYPE_Q8_0) {
  new_type = GGML_TYPE_Q6_K;
  }
  } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4800,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
  else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
  (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+ if (model.type == MODEL_70B) {
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+ }
  ++i_attention_wv;
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+ : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+ if (model.arch == LLM_ARCH_FALCON) {
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ } else {
+ if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+ new_type = GGML_TYPE_Q5_K;
  }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
  ++i_feed_forward_w2;
  } else if (name.find("attn_output.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ if (model.arch != LLM_ARCH_FALCON) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
  }
  else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4857,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
  int nx = tensor->ne[0];
  int ny = tensor->ne[1];
- if (nx % QK_K != 0 || ny % QK_K != 0) {
- LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ if (nx % QK_K != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
  convert_incompatible_tensor = true;
  }
  }
@@ -4998,7 +5287,7 @@ struct llama_context_params llama_context_default_params() {
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
- /*.mul_mat_q =*/ false,
+ /*.mul_mat_q =*/ true,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
@@ -5297,13 +5586,29 @@ int llama_model_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }

- int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+ int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  model->name.c_str(),
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }

+ uint64_t llama_model_size(const struct llama_model * model) {
+ uint64_t size = 0;
+ for (const auto & it : model->tensors_by_name) {
+ size += ggml_nbytes(it.second);
+ }
+ return size;
+ }
+
+ uint64_t llama_model_n_params(const struct llama_model * model) {
+ uint64_t nparams = 0;
+ for (const auto & it : model->tensors_by_name) {
+ nparams += ggml_nelements(it.second);
+ }
+ return nparams;
+ }
+
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
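
Note: together with the renamed llama_model_desc, the new llama_model_size and llama_model_n_params accessors make a one-line model summary straightforward; a small sketch:

char desc[128];
llama_model_desc(model, desc, sizeof(desc));
printf("%s | params: %.2f B | size: %.2f GiB\n",
       desc,
       llama_model_n_params(model) / 1e9,
       llama_model_size(model) / (1024.0 * 1024.0 * 1024.0));
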
@@ -5828,8 +6133,7 @@ int llama_tokenize_with_model(
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
- auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+ auto res = llama_tokenize_internal(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
  LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6147,12 @@ int llama_tokenize_with_model(
  return res.size();
  }

- int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- return llama_token_to_str_with_model(&ctx->model, token, buf, length);
+ int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+ return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
  }

- // does not write null-terminator to str
- int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
+ // does not write null-terminator to buf
+ int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
  if (0 <= token && token < llama_model_n_vocab(model)) {
  if (llama_is_normal_token(model->vocab, token)) {
  std::string result = model->vocab.id_to_token[token].text;
@@ -5936,11 +6240,40 @@ const char * llama_print_system_info(void) {
  s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
  s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

  return s.c_str();
  }

+ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+ fprintf(stream, "\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "# Timings #\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+ 1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+ fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+ 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+ fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+ 1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+ fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+ fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+ fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
+ fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
+ fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
+ fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+ fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
+ fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+ 1.0e6 * ctx->n_eval / ctx->t_eval_us);
+ fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+ 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+ fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+ 1.0e6 * ctx->n_sample / ctx->t_sample_us);
+ }
+
  // For internal test use
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
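
Note: llama_dump_timing_info_yaml appends the commented YAML block above to whatever FILE * the caller provides, e.g. a per-run log:

FILE * logfile = fopen("run.yml", "a");
if (logfile != NULL) {
    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
}
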
@@ -5951,10 +6284,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
  g_state.log_callback_user_data = user_data;
  }

- #if defined(_MSC_VER) && !defined(vsnprintf)
- #define vsnprintf _vsnprintf
- #endif
-
  static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
  va_list args_copy;
  va_copy(args_copy, args);