llama_cpp 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,6 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
- #include <cstddef>
- #include <cstdint>
- #include <cstdio>
  #endif

  #include "llama.h"
@@ -62,6 +59,9 @@
  #include <cinttypes>
  #include <climits>
  #include <cstdarg>
+ #include <cstddef>
+ #include <cstdint>
+ #include <cstdio>
  #include <cstring>
  #include <ctime>
  #include <fstream>
@@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
  }

  void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- for (size_t pos = 0; ; pos += replace.length()) {
- pos = s.find(search, pos);
- if (pos == std::string::npos) break;
- s.erase(pos, search.length());
- s.insert(pos, replace);
+ std::string result;
+ for (size_t pos = 0; ; pos += search.length()) {
+ auto new_pos = s.find(search, pos);
+ if (new_pos == std::string::npos) {
+ result += s.substr(pos, s.size() - pos);
+ break;
+ }
+ result += s.substr(pos, new_pos - pos) + replace;
+ pos = new_pos;
  }
+ s = std::move(result);
  }

  static void zeros(std::ofstream & file, size_t n) {
@@ -796,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

- static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_str(ctx, token, result.data(), result.size());
+ int check = llama_token_to_piece(ctx, token, result.data(), result.size());
  GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
@@ -955,10 +960,10 @@ struct llama_vocab {
  id linefeed_id = 13;

  int find_bpe_rank(std::string token_left, std::string token_right) const {
- replace_all(token_left, " ", "Ġ");
- replace_all(token_left, "\n", "Ċ");
- replace_all(token_right, " ", "Ġ");
- replace_all(token_right, "\n", "Ċ");
+ replace_all(token_left, " ", "\u0120");
+ replace_all(token_left, "\n", "\u010A");
+ replace_all(token_right, " ", "\u0120");
+ replace_all(token_right, "\n", "\u010A");

  auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  if (it == bpe_ranks.end()) {
@@ -1144,11 +1149,13 @@ static bool llama_kv_cache_init(

  enum llama_fver {
  GGUF_FILE_VERSION_V1 = 1,
+ GGUF_FILE_VERSION_V2 = 2,
  };

  static const char * llama_file_version_name(llama_fver version) {
  switch (version) {
- case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
+ case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+ case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
  }

  return "unknown";
@@ -1635,7 +1642,8 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
  llama_model_loader & ml,
@@ -1737,7 +1745,11 @@ static void llm_load_vocab(
  }

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+ if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } else {
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+ }

  // special tokens
  GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -2635,18 +2647,20 @@ static struct ggml_cgraph * llm_build_falcon(

  const size_t wsize = ggml_type_size(cur->type);

- struct ggml_tensor * tmpq = ggml_view_3d(
+ // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+ // non-contiguous views is added for the rope operator
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- 0);
+ 0));
  offload_func_kq(tmpq);

- struct ggml_tensor * tmpk = ggml_view_3d(
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head_kv, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * n_head);
+ wsize * n_embd_head * n_head));
  offload_func_kq(tmpk);

  struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2845,6 @@ static bool llama_eval_internal(

  GGML_ASSERT(n_tokens > 0);
  GGML_ASSERT(n_past >= 0);
- GGML_ASSERT(n_threads > 0);
  // TODO: keep the values of n_batch and n_ctx
  // GGML_ASSERT(n_tokens <= n_batch);
  // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2855,8 @@ static bool llama_eval_internal(
  ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
  #endif

+ GGML_ASSERT(n_threads > 0);
+
  const int N = n_tokens;

  const auto & model = lctx.model;
@@ -3026,16 +3041,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
  return vocab.token_to_id.at(buf);
  }

- static std::string llama_escape_whitespace(const std::string& text) {
- std::string result = "\xe2\x96\x81";
- for (size_t offs = 0; offs < text.length(); ++offs) {
- if (text[offs] == ' ') {
- result += "\xe2\x96\x81";
- } else {
- result += text[offs];
- }
- }
- return result;
+ static void llama_escape_whitespace(std::string & text) {
+ replace_all(text, " ", "\xe2\x96\x81");
  }

  static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3211,7 @@ private:

  struct llm_bigram_bpe {
  struct comparator {
- bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+ bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
  return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
  }
  };
@@ -3219,7 +3226,7 @@ struct llm_bigram_bpe {
  };

  struct llm_tokenizer_bpe {
- llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  int final_prev_index = -1;
@@ -3352,26 +3359,23 @@ private:
  }

  // probably not 100% correct
- // TODO: this is quite slow - how to make it more efficient?
- static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
  std::vector<std::string> words;

  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
  const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
  const std::regex re(pattern);
- std::smatch m;

- while (std::regex_search(text, m, re)) {
- for (auto x : m) {
- words.push_back(x);
- }
- text = m.suffix();
+ auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+ auto words_end = std::sregex_iterator();
+ auto n_words = std::distance(words_begin, words_end);
+ words.reserve(n_words);
+ for (auto it = words_begin; it != words_end; ++it) {
+ words.push_back(it->str());
  }
-
  return words;
- }

- bool flag_g2ws = false;
+ }

  const llama_vocab & vocab;

@@ -3381,9 +3385,18 @@ private:
  llm_bigram_bpe::queue work_queue;
  };

- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
  std::vector<llama_vocab::id> output;

+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_bos=True) returns [1]
+ // tokenizer.encode('', add_bos=False) returns []
+
+ if (bos && vocab.special_bos_id != -1) {
+ output.push_back(vocab.special_bos_id);
+ }
+
  if (raw_text.empty()) {
  return output;
  }
@@ -3391,29 +3404,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
- llm_tokenizer_spm tokenizer(vocab);
+ // without adding this leading whitespace, we do not get the same results as the original tokenizer
+ raw_text = " " + raw_text;

- if (bos) {
- output.push_back(vocab.special_bos_id);
- }
-
- std::string text;
- if (escape) {
- text = llama_escape_whitespace(raw_text);
- } else {
- text = raw_text;
- }
-
- tokenizer.tokenize(text, output);
+ llm_tokenizer_spm tokenizer(vocab);
+ llama_escape_whitespace(raw_text);
+ tokenizer.tokenize(raw_text, output);
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- llm_tokenizer_bpe tokenizer(vocab, escape);
-
- if (bos && vocab.special_bos_id != -1) {
- output.push_back(vocab.special_bos_id);
- }
-
+ llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } break;
  };
@@ -3908,7 +3908,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *

  // Calculate absolute value of second derivatives
  for (size_t i = 0; i < second_derivatives.size(); ++i) {
- second_derivatives[i] = abs(second_derivatives[i]);
+ second_derivatives[i] = std::abs(second_derivatives[i]);
  }

  // Normalize the second derivatives
@@ -4099,16 +4099,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  std::vector<llama_grammar_candidate> candidates_grammar;

  for (size_t i = 0; i < candidates->size; ++i) {
- const llama_token id = candidates->data[i].id;
- const std::string text = llama_token_to_text(ctx, id);
+ const llama_token id = candidates->data[i].id;
+ const std::string piece = llama_token_to_str(ctx, id);
  if (id == eos) {
  if (!allow_eos) {
  candidates->data[i].logit = -INFINITY;
  }
- } else if (text.empty() || text[0] == 0) {
+ } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -4312,10 +4312,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }

- const std::string text = llama_token_to_text(ctx, token);
+ const std::string piece = llama_token_to_str(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4326,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }

+ //
+ // Beam search
+ //
+
+ struct llama_beam {
+ std::vector<llama_token> tokens;
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+ // Sort beams by probability. In case of ties, prefer beams at eob.
+ bool operator<(const llama_beam & rhs) const {
+ return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+ }
+ // Shift off first n tokens and discard them.
+ void shift_tokens(const size_t n) {
+ if (n) {
+ std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+ tokens.resize(tokens.size() - n);
+ }
+ }
+ llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+ };
+
+ // A struct for calculating logit-related info.
+ struct llama_logit_info {
+ const float * const logits;
+ const int n_vocab;
+ const float max_l;
+ const float normalizer;
+ struct sum_exp {
+ float max_l;
+ float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+ };
+ llama_logit_info(llama_context * ctx)
+ : logits(llama_get_logits(ctx))
+ , n_vocab(llama_n_vocab(ctx))
+ , max_l(*std::max_element(logits, logits + n_vocab))
+ , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+ { }
+ llama_token_data get_token_data(const llama_token token_id) const {
+ constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+ return {token_id, logits[token_id], p};
+ }
+ // Return top k token_data by logit.
+ std::vector<llama_token_data> top_k(size_t k) {
+ std::vector<llama_token_data> min_heap; // min-heap by logit
+ const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+ min_heap.reserve(k_min);
+ for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+ min_heap.push_back(get_token_data(token_id));
+ }
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+ std::make_heap(min_heap.begin(), min_heap.end(), comp);
+ for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+ if (min_heap.front().logit < logits[token_id]) {
+ std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+ min_heap.back().id = token_id;
+ min_heap.back().logit = logits[token_id];
+ std::push_heap(min_heap.begin(), min_heap.end(), comp);
+ }
+ }
+ return min_heap;
+ }
+ float probability_from_logit(float logit) {
+ return normalizer * std::exp(logit - max_l);
+ }
+ };
+
+ struct llama_beam_search_data {
+ llama_context * ctx;
+ size_t n_beams;
+ int n_past;
+ int n_predict;
+ int n_threads;
+ std::vector<llama_beam> beams;
+ std::vector<llama_beam> next_beams;
+
+ // Re-calculated on each loop iteration
+ size_t common_prefix_length;
+
+ // Used to communicate to/from callback on beams state.
+ std::vector<llama_beam_view> beam_views;
+
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+ : ctx(ctx)
+ , n_beams(n_beams)
+ , n_past(n_past)
+ , n_predict(n_predict)
+ , n_threads(n_threads)
+ , beam_views(n_beams) {
+ beams.reserve(n_beams);
+ next_beams.reserve(n_beams);
+ }
+
+ // Collapse beams to a single beam given by index.
+ void collapse_beams(const size_t beam_idx) {
+ if (0u < beam_idx) {
+ std::swap(beams[0], beams[beam_idx]);
+ }
+ beams.resize(1);
+ }
+
+ // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+ // The repetative patterns below reflect the 2 stages of heaps:
+ // * Gather elements until the vector is full, then call std::make_heap() on it.
+ // * If the heap is full and a new element is found that should be included, pop the
+ // least element to the back(), replace it with the new, then push it into the heap.
+ void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+ // Min-heaps use a greater-than comparator.
+ const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+ if (beam.eob) {
+ // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+ if (next_beams.size() < n_beams) {
+ next_beams.push_back(std::move(beam));
+ if (next_beams.size() == n_beams) {
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else if (next_beams.front().p < beam.p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = std::move(beam);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else {
+ // beam is not at end-of-sentence, so branch with next top_k tokens.
+ if (!beam.tokens.empty()) {
+ llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+ }
+ llama_logit_info logit_info(ctx);
+ std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+ size_t i=0;
+ if (next_beams.size() < n_beams) {
+ for (; next_beams.size() < n_beams ; ++i) {
+ llama_beam next_beam = beam;
+ next_beam.tokens.push_back(next_tokens[i].id);
+ next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ next_beams.push_back(std::move(next_beam));
+ }
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ } else {
+ for (; next_beams.front().p == 0.0f ; ++i) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ for (; i < n_beams ; ++i) {
+ const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+ if (next_beams.front().p < next_p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p = next_p;
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ }
+ }
+
+ // Find common_prefix_length based on beams.
+ // Requires beams is not empty.
+ size_t find_common_prefix_length() {
+ size_t common_prefix_length = beams[0].tokens.size();
+ for (size_t i = 1 ; i < beams.size() ; ++i) {
+ common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+ for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+ if (beams[0].tokens[j] != beams[i].tokens[j]) {
+ common_prefix_length = j;
+ break;
+ }
+ }
+ }
+ return common_prefix_length;
+ }
+
+ // Construct beams_state to send back to caller via the callback function.
+ // Side effect: set common_prefix_length = find_common_prefix_length();
+ llama_beams_state get_beams_state(const bool last_call) {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beam_views[i] = beams[i].view();
+ }
+ common_prefix_length = find_common_prefix_length();
+ return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+ }
+
+ // Loop:
+ // * while i < n_predict, AND
+ // * any of the beams have not yet reached end-of-beam (eob), AND
+ // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+ // (since all other beam probabilities can only decrease)
+ void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+ beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
+ const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+ for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+ !beams[top_beam_index()].eob ; ++i) {
+ callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
+ update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
+ if (common_prefix_length) {
+ llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+ n_past += common_prefix_length;
+ }
+ // Zero-out next_beam probabilities to place them last in following min-heap.
+ std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+ for (llama_beam & beam : beams) {
+ beam.shift_tokens(common_prefix_length);
+ fill_next_beams_by_top_probabilities(beam);
+ }
+ // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+ beams.swap(next_beams);
+ renormalize_beam_probabilities(beams);
+ }
+ collapse_beams(top_beam_index());
+ callback(callback_data, get_beams_state(true));
+ }
+
+ // As beams grow, the cumulative probabilities decrease.
+ // Renormalize them to avoid floating point underflow.
+ static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+ const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+ const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+ std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+ }
+
+ // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+ size_t top_beam_index() {
+ return std::max_element(beams.begin(), beams.end()) - beams.begin();
+ }
+
+ // Copy (p,eob) for each beam which may have been changed by the callback.
+ void update_beams_from_beam_views() {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beams[i].p = beam_views[i].p;
+ beams[i].eob = beam_views[i].eob;
+ }
+ }
+ };
+
+ void llama_beam_search(llama_context * ctx,
+ llama_beam_search_callback_fn_t callback, void * callback_data,
+ size_t n_beams, int n_past, int n_predict, int n_threads) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+ beam_search_data.loop(callback, callback_data);
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+
  //
  // quantization
  //
@@ -4423,6 +4674,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));

+ llama_model model;
+ llm_load_arch(*ml, model);
+ llm_load_hparams(*ml, model, 0, 0, 0);
+
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
  struct gguf_context * ctx_out = gguf_init_empty();

@@ -4448,6 +4703,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  ++n_feed_forward_w2;
  }
  }
+ if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+ __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+ }

  int i_attention_wv = 0;
  int i_feed_forward_w2 = 0;
@@ -4524,8 +4783,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
  int nx = tensor->ne[0];
- int ny = tensor->ne[1];
- if (nx % QK_K == 0 && ny % QK_K == 0) {
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (new_type != GGML_TYPE_Q8_0) {
  new_type = GGML_TYPE_Q6_K;
  }
  } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4800,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
  else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
  (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+ if (model.type == MODEL_70B) {
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+ }
  ++i_attention_wv;
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+ : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+ if (model.arch == LLM_ARCH_FALCON) {
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ } else {
+ if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+ new_type = GGML_TYPE_Q5_K;
  }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
  ++i_feed_forward_w2;
  } else if (name.find("attn_output.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ if (model.arch != LLM_ARCH_FALCON) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
  }
  else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4857,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
  int nx = tensor->ne[0];
  int ny = tensor->ne[1];
- if (nx % QK_K != 0 || ny % QK_K != 0) {
- LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ if (nx % QK_K != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
  convert_incompatible_tensor = true;
  }
  }
@@ -4998,7 +5287,7 @@ struct llama_context_params llama_context_default_params() {
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
- /*.mul_mat_q =*/ false,
+ /*.mul_mat_q =*/ true,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
@@ -5297,13 +5586,29 @@ int llama_model_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }

- int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+ int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  model->name.c_str(),
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }

+ uint64_t llama_model_size(const struct llama_model * model) {
+ uint64_t size = 0;
+ for (const auto & it : model->tensors_by_name) {
+ size += ggml_nbytes(it.second);
+ }
+ return size;
+ }
+
+ uint64_t llama_model_n_params(const struct llama_model * model) {
+ uint64_t nparams = 0;
+ for (const auto & it : model->tensors_by_name) {
+ nparams += ggml_nelements(it.second);
+ }
+ return nparams;
+ }
+
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
@@ -5828,8 +6133,7 @@ int llama_tokenize_with_model(
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
- auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+ auto res = llama_tokenize_internal(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
  LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6147,12 @@ int llama_tokenize_with_model(
  return res.size();
  }

- int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- return llama_token_to_str_with_model(&ctx->model, token, buf, length);
+ int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+ return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
  }

- // does not write null-terminator to str
- int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
+ // does not write null-terminator to buf
+ int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
  if (0 <= token && token < llama_model_n_vocab(model)) {
  if (llama_is_normal_token(model->vocab, token)) {
  std::string result = model->vocab.id_to_token[token].text;
@@ -5936,11 +6240,40 @@ const char * llama_print_system_info(void) {
  s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
  s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

  return s.c_str();
  }

+ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+ fprintf(stream, "\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "# Timings #\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+ 1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+ fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+ 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+ fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+ 1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+ fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+ fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+ fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
+ fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
+ fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
+ fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+ fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
+ fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+ 1.0e6 * ctx->n_eval / ctx->t_eval_us);
+ fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+ 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+ fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+ 1.0e6 * ctx->n_sample / ctx->t_sample_us);
+ }
+
  // For internal test use
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
@@ -5951,10 +6284,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
  g_state.log_callback_user_data = user_data;
  }

- #if defined(_MSC_VER) && !defined(vsnprintf)
- #define vsnprintf _vsnprintf
- #endif
-
  static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
  va_list args_copy;
  va_copy(args_copy, args);
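
The snippet below is a minimal usage sketch, not part of the diff, for the renamed public llama_token_to_piece API: a negative return value is the negated buffer size required, mirroring the internal llama_token_to_str helper changed above. The wrapper name token_to_piece is illustrative only, and ctx is assumed to be an already-created llama_context.

#include <string>
#include <vector>
#include "llama.h"

// Illustrative helper (assumption, not from the diff): convert one token id to its
// text piece, growing the buffer on demand as the new API reports the needed size.
static std::string token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> buf(8, 0);
    const int n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
    if (n < 0) {
        buf.resize(-n); // negative return value = required buffer size
        llama_token_to_piece(ctx, token, buf.data(), buf.size());
    } else {
        buf.resize(n);
    }
    // The API does not write a null terminator, so construct the string from the size.
    return std::string(buf.data(), buf.size());
}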