llama_cpp 0.4.0 → 0.5.1

@@ -1,9 +1,6 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
- #include <cstddef>
- #include <cstdint>
- #include <cstdio>
  #endif

  #include "llama.h"
@@ -62,6 +59,9 @@
  #include <cinttypes>
  #include <climits>
  #include <cstdarg>
+ #include <cstddef>
+ #include <cstdint>
+ #include <cstdio>
  #include <cstring>
  #include <ctime>
  #include <fstream>
@@ -114,13 +114,21 @@ static size_t utf8_len(char src) {
  }

  void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- for (size_t pos = 0; ; pos += replace.length()) {
- pos = s.find(search, pos);
- if (pos == std::string::npos) break;
- s.erase(pos, search.length());
- s.insert(pos, replace);
+ std::string result;
+ for (size_t pos = 0; ; pos += search.length()) {
+ auto new_pos = s.find(search, pos);
+ if (new_pos == std::string::npos) {
+ result += s.substr(pos, s.size() - pos);
+ break;
+ }
+ result += s.substr(pos, new_pos - pos) + replace;
+ pos = new_pos;
  }
+ s = std::move(result);
  }
+ #ifdef GGML_USE_CPU_HBM
+ #include <hbwmalloc.h>
+ #endif

  static void zeros(std::ofstream & file, size_t n) {
  char zero = 0;
@@ -320,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_GPT2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_GPTJ,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_GPTNEOX,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_MPT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_UNKNOWN,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
  };

  static llm_arch llm_arch_from_string(const std::string & name) {
@@ -407,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  #elif GGML_USE_METAL
  # define llama_host_malloc(n) ggml_metal_host_malloc(n)
  # define llama_host_free(data) ggml_metal_host_free(data)
+ #elif GGML_USE_CPU_HBM
+ # define llama_host_malloc(n) hbw_malloc(n)
+ # define llama_host_free(data) if (data != NULL) hbw_free(data)
  #else
  # define llama_host_malloc(n) malloc(n)
  # define llama_host_free(data) free(data)
@@ -563,16 +612,16 @@ struct llama_mmap {

  if (prefetch > 0) {
  // Advise the kernel to preload the mapped memory
- if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
  strerror(errno));
  }
  }
  if (numa) {
  // advise the kernel not to use readahead
  // (because the next page might not belong on the same node)
- if (madvise(addr, file->size, MADV_RANDOM)) {
- fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+ if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
  strerror(errno));
  }
  }
@@ -609,7 +658,9 @@ struct llama_mmap {
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
  if (prefetch) {
  // Advise the kernel to preload the mapped memory
+
  WIN32_MEMORY_RANGE_ENTRY range;
+
  range.VirtualAddress = addr;
  range.NumberOfBytes = (SIZE_T)size;
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -796,12 +847,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

- static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_str(ctx, token, result.data(), result.size());
+ int check = llama_token_to_piece(ctx, token, result.data(), result.size());
  GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
@@ -955,10 +1006,10 @@ struct llama_vocab {
  id linefeed_id = 13;

  int find_bpe_rank(std::string token_left, std::string token_right) const {
- replace_all(token_left, " ", "Ġ");
- replace_all(token_left, "\n", "Ċ");
- replace_all(token_right, " ", "Ġ");
- replace_all(token_right, "\n", "Ċ");
+ replace_all(token_left, " ", "\u0120");
+ replace_all(token_left, "\n", "\u010A");
+ replace_all(token_right, " ", "\u0120");
+ replace_all(token_right, "\n", "\u010A");

  auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  if (it == bpe_ranks.end()) {
@@ -1144,11 +1195,13 @@ static bool llama_kv_cache_init(

  enum llama_fver {
  GGUF_FILE_VERSION_V1 = 1,
+ GGUF_FILE_VERSION_V2 = 2,
  };

  static const char * llama_file_version_name(llama_fver version) {
  switch (version) {
- case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
+ case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+ case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
  }

  return "unknown";
@@ -1439,7 +1492,11 @@ struct llama_model_loader {
  // allocate temp buffer if not using mmap
  if (!use_mmap && cur->data == NULL) {
  GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
- cur->data = malloc(ggml_nbytes(cur));
+ #ifdef GGML_USE_CPU_HBM
+ cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+ #else
+ cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+ #endif
  }

  load_data_for(cur);
@@ -1593,9 +1650,13 @@ static void llm_load_hparams(

  GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));

- if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ }
  }
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+ // gpt-j n_rot = rotary_dim
  }

  // arch-specific KVs
@@ -1635,7 +1696,8 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
  llama_model_loader & ml,
@@ -1737,7 +1799,11 @@ static void llm_load_vocab(
  }

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+ if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } else {
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+ }

  // special tokens
  GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -2635,18 +2701,20 @@ static struct ggml_cgraph * llm_build_falcon(

  const size_t wsize = ggml_type_size(cur->type);

- struct ggml_tensor * tmpq = ggml_view_3d(
+ // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+ // non-contiguous views is added for the rope operator
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- 0);
+ 0));
  offload_func_kq(tmpq);

- struct ggml_tensor * tmpk = ggml_view_3d(
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head_kv, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * n_head);
+ wsize * n_embd_head * n_head));
  offload_func_kq(tmpk);

  struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2899,6 @@ static bool llama_eval_internal(

  GGML_ASSERT(n_tokens > 0);
  GGML_ASSERT(n_past >= 0);
- GGML_ASSERT(n_threads > 0);
  // TODO: keep the values of n_batch and n_ctx
  // GGML_ASSERT(n_tokens <= n_batch);
  // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2909,8 @@ static bool llama_eval_internal(
  ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
  #endif

+ GGML_ASSERT(n_threads > 0);
+
  const int N = n_tokens;

  const auto & model = lctx.model;
@@ -2880,7 +2949,12 @@ static bool llama_eval_internal(

  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+ // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+ // with the BLAS calls. need a better solution
+ if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+ n_threads = std::min(4, n_threads);
+ }

  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -2985,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
  }

- static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
- }
-
- static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
- }
-
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
  }

- static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(llama_is_control_token(vocab, id));
- return id == vocab.special_bos_id;
- }
-
- static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
- GGML_ASSERT(llama_is_control_token(vocab, id));
- return id == vocab.special_eos_id;
- }
-
- static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
- GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
- return id == vocab.special_pad_id;
- }
-
  static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
  GGML_ASSERT(llama_is_byte_token(vocab, id));
  const auto& token_data = vocab.id_to_token.at(id);
@@ -3026,16 +3077,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
  return vocab.token_to_id.at(buf);
  }

- static std::string llama_escape_whitespace(const std::string& text) {
- std::string result = "\xe2\x96\x81";
- for (size_t offs = 0; offs < text.length(); ++offs) {
- if (text[offs] == ' ') {
- result += "\xe2\x96\x81";
- } else {
- result += text[offs];
- }
- }
- return result;
+ static void llama_escape_whitespace(std::string & text) {
+ replace_all(text, " ", "\xe2\x96\x81");
  }

  static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3247,7 @@ private:

  struct llm_bigram_bpe {
  struct comparator {
- bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+ bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
  return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
  }
  };
@@ -3219,7 +3262,7 @@ struct llm_bigram_bpe {
  };

  struct llm_tokenizer_bpe {
- llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  int final_prev_index = -1;
@@ -3312,9 +3355,15 @@ struct llm_tokenizer_bpe {
  std::string byte_str(1, *j);
  auto token_multibyte = vocab.token_to_id.find(byte_str);
  if (token_multibyte == vocab.token_to_id.end()) {
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ try {
+ llama_token token_byte = llama_byte_to_token(vocab, *j);
+ output.push_back(token_byte);
+ } catch (const std::out_of_range & err) {
+ fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ }
+ } else {
+ output.push_back((*token_multibyte).second);
  }
- output.push_back((*token_multibyte).second);
  }
  } else {
  output.push_back((*token).second);
@@ -3352,26 +3401,23 @@ private:
  }

  // probably not 100% correct
- // TODO: this is quite slow - how to make it more efficient?
- static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
  std::vector<std::string> words;

  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
  const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
  const std::regex re(pattern);
- std::smatch m;

- while (std::regex_search(text, m, re)) {
- for (auto x : m) {
- words.push_back(x);
- }
- text = m.suffix();
+ auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+ auto words_end = std::sregex_iterator();
+ auto n_words = std::distance(words_begin, words_end);
+ words.reserve(n_words);
+ for (auto it = words_begin; it != words_end; ++it) {
+ words.push_back(it->str());
  }
-
  return words;
- }

- bool flag_g2ws = false;
+ }

  const llama_vocab & vocab;

@@ -3381,9 +3427,18 @@ private:
  llm_bigram_bpe::queue work_queue;
  };

- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
  std::vector<llama_vocab::id> output;

+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_bos=True) returns [1]
+ // tokenizer.encode('', add_bos=False) returns []
+
+ if (bos && vocab.special_bos_id != -1) {
+ output.push_back(vocab.special_bos_id);
+ }
+
  if (raw_text.empty()) {
  return output;
  }
@@ -3391,29 +3446,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
- llm_tokenizer_spm tokenizer(vocab);
-
- if (bos) {
- output.push_back(vocab.special_bos_id);
- }
+ // without adding this leading whitespace, we do not get the same results as the original tokenizer
+ raw_text = " " + raw_text;

- std::string text;
- if (escape) {
- text = llama_escape_whitespace(raw_text);
- } else {
- text = raw_text;
- }
-
- tokenizer.tokenize(text, output);
+ llm_tokenizer_spm tokenizer(vocab);
+ llama_escape_whitespace(raw_text);
+ tokenizer.tokenize(raw_text, output);
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- llm_tokenizer_bpe tokenizer(vocab, escape);
-
- if (bos && vocab.special_bos_id != -1) {
- output.push_back(vocab.special_bos_id);
- }
-
+ llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } break;
  };
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

  if (stack.empty()) {
- new_stacks.push_back(stack);
+ new_stacks.emplace_back(stack);
  return;
  }

@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
- new_stacks.push_back(stack);
+ new_stacks.emplace_back(stack);
  break;
  default:
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
  delete grammar;
  }

+ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+ llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+ // redirect elements in stacks to point to new rules
+ for (size_t is = 0; is < result->stacks.size(); is++) {
+ for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+ for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+ for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+ if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+ result->stacks[is][ie] = &result->rules[ir0][ir1];
+ }
+ }
+ }
+ }
+ }
+
+ return result;
+ }
+
  //
  // sampling
  //
@@ -3908,7 +3969,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *

  // Calculate absolute value of second derivatives
  for (size_t i = 0; i < second_derivatives.size(); ++i) {
- second_derivatives[i] = abs(second_derivatives[i]);
+ second_derivatives[i] = std::abs(second_derivatives[i]);
  }

  // Normalize the second derivatives
@@ -4099,16 +4160,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  std::vector<llama_grammar_candidate> candidates_grammar;

  for (size_t i = 0; i < candidates->size; ++i) {
- const llama_token id = candidates->data[i].id;
- const std::string text = llama_token_to_text(ctx, id);
+ const llama_token id = candidates->data[i].id;
+ const std::string piece = llama_token_to_str(ctx, id);
  if (id == eos) {
  if (!allow_eos) {
  candidates->data[i].logit = -INFINITY;
  }
- } else if (text.empty() || text[0] == 0) {
+ } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -4312,10 +4373,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }

- const std::string text = llama_token_to_text(ctx, token);
+ const std::string piece = llama_token_to_str(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4387,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }

+ //
+ // Beam search
+ //
+
+ struct llama_beam {
+ std::vector<llama_token> tokens;
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+ // Sort beams by probability. In case of ties, prefer beams at eob.
+ bool operator<(const llama_beam & rhs) const {
+ return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+ }
+ // Shift off first n tokens and discard them.
+ void shift_tokens(const size_t n) {
+ if (n) {
+ std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+ tokens.resize(tokens.size() - n);
+ }
+ }
+ llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+ };
+
+ // A struct for calculating logit-related info.
+ struct llama_logit_info {
+ const float * const logits;
+ const int n_vocab;
+ const float max_l;
+ const float normalizer;
+ struct sum_exp {
+ float max_l;
+ float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+ };
+ llama_logit_info(llama_context * ctx)
+ : logits(llama_get_logits(ctx))
+ , n_vocab(llama_n_vocab(ctx))
+ , max_l(*std::max_element(logits, logits + n_vocab))
+ , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+ { }
+ llama_token_data get_token_data(const llama_token token_id) const {
+ constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+ return {token_id, logits[token_id], p};
+ }
+ // Return top k token_data by logit.
+ std::vector<llama_token_data> top_k(size_t k) {
+ std::vector<llama_token_data> min_heap; // min-heap by logit
+ const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+ min_heap.reserve(k_min);
+ for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+ min_heap.push_back(get_token_data(token_id));
+ }
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+ std::make_heap(min_heap.begin(), min_heap.end(), comp);
+ for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+ if (min_heap.front().logit < logits[token_id]) {
+ std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+ min_heap.back().id = token_id;
+ min_heap.back().logit = logits[token_id];
+ std::push_heap(min_heap.begin(), min_heap.end(), comp);
+ }
+ }
+ return min_heap;
+ }
+ float probability_from_logit(float logit) const {
+ return normalizer * std::exp(logit - max_l);
+ }
+ };
+
+ struct llama_beam_search_data {
+ llama_context * ctx;
+ size_t n_beams;
+ int n_past;
+ int n_predict;
+ int n_threads;
+ std::vector<llama_beam> beams;
+ std::vector<llama_beam> next_beams;
+
+ // Re-calculated on each loop iteration
+ size_t common_prefix_length;
+
+ // Used to communicate to/from callback on beams state.
+ std::vector<llama_beam_view> beam_views;
+
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+ : ctx(ctx)
+ , n_beams(n_beams)
+ , n_past(n_past)
+ , n_predict(n_predict)
+ , n_threads(n_threads)
+ , beam_views(n_beams) {
+ beams.reserve(n_beams);
+ next_beams.reserve(n_beams);
+ }
+
+ // Collapse beams to a single beam given by index.
+ void collapse_beams(const size_t beam_idx) {
+ if (0u < beam_idx) {
+ std::swap(beams[0], beams[beam_idx]);
+ }
+ beams.resize(1);
+ }
+
+ // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+ // The repetative patterns below reflect the 2 stages of heaps:
+ // * Gather elements until the vector is full, then call std::make_heap() on it.
+ // * If the heap is full and a new element is found that should be included, pop the
+ // least element to the back(), replace it with the new, then push it into the heap.
+ void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+ // Min-heaps use a greater-than comparator.
+ const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+ if (beam.eob) {
+ // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+ if (next_beams.size() < n_beams) {
+ next_beams.push_back(std::move(beam));
+ if (next_beams.size() == n_beams) {
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else if (next_beams.front().p < beam.p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = std::move(beam);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else {
+ // beam is not at end-of-sentence, so branch with next top_k tokens.
+ if (!beam.tokens.empty()) {
+ llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+ }
+ llama_logit_info logit_info(ctx);
+ std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+ size_t i=0;
+ if (next_beams.size() < n_beams) {
+ for (; next_beams.size() < n_beams ; ++i) {
+ llama_beam next_beam = beam;
+ next_beam.tokens.push_back(next_tokens[i].id);
+ next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ next_beams.push_back(std::move(next_beam));
+ }
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ } else {
+ for (; next_beams.front().p == 0.0f ; ++i) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ for (; i < n_beams ; ++i) {
+ const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+ if (next_beams.front().p < next_p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p = next_p;
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ }
+ }
+
+ // Find common_prefix_length based on beams.
+ // Requires beams is not empty.
+ size_t find_common_prefix_length() {
+ size_t common_prefix_length = beams[0].tokens.size();
+ for (size_t i = 1 ; i < beams.size() ; ++i) {
+ common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+ for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+ if (beams[0].tokens[j] != beams[i].tokens[j]) {
+ common_prefix_length = j;
+ break;
+ }
+ }
+ }
+ return common_prefix_length;
+ }
+
+ // Construct beams_state to send back to caller via the callback function.
+ // Side effect: set common_prefix_length = find_common_prefix_length();
+ llama_beams_state get_beams_state(const bool last_call) {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beam_views[i] = beams[i].view();
+ }
+ common_prefix_length = find_common_prefix_length();
+ return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+ }
+
+ // Loop:
+ // * while i < n_predict, AND
+ // * any of the beams have not yet reached end-of-beam (eob), AND
+ // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+ // (since all other beam probabilities can only decrease)
+ void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+ beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
+ const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+ for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+ !beams[top_beam_index()].eob ; ++i) {
+ callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
+ update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
+ if (common_prefix_length) {
+ llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+ n_past += common_prefix_length;
+ }
+ // Zero-out next_beam probabilities to place them last in following min-heap.
+ std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+ for (llama_beam & beam : beams) {
+ beam.shift_tokens(common_prefix_length);
+ fill_next_beams_by_top_probabilities(beam);
+ }
+ // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+ beams.swap(next_beams);
+ renormalize_beam_probabilities(beams);
+ }
+ collapse_beams(top_beam_index());
+ callback(callback_data, get_beams_state(true));
+ }
+
+ // As beams grow, the cumulative probabilities decrease.
+ // Renormalize them to avoid floating point underflow.
+ static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+ const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+ const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+ std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+ }
+
+ // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+ size_t top_beam_index() {
+ return std::max_element(beams.begin(), beams.end()) - beams.begin();
+ }
+
+ // Copy (p,eob) for each beam which may have been changed by the callback.
+ void update_beams_from_beam_views() {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beams[i].p = beam_views[i].p;
+ beams[i].eob = beam_views[i].eob;
+ }
+ }
+ };
+
+ void llama_beam_search(llama_context * ctx,
+ llama_beam_search_callback_fn_t callback, void * callback_data,
+ size_t n_beams, int n_past, int n_predict, int n_threads) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+ beam_search_data.loop(callback, callback_data);
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+
  //
  // quantization
  //
@@ -4423,6 +4735,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));

+ llama_model model;
+ llm_load_arch(*ml, model);
+ llm_load_hparams(*ml, model, 0, 0, 0);
+
+ if (params->only_copy) {
+ ftype = model.ftype;
+ }
+
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
  struct gguf_context * ctx_out = gguf_init_empty();

@@ -4448,6 +4768,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  ++n_feed_forward_w2;
  }
  }
+ if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+ __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+ }

  int i_attention_wv = 0;
  int i_feed_forward_w2 = 0;
@@ -4460,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<std::thread> workers;
  std::mutex mutex;

+ #ifdef GGML_USE_K_QUANTS
  auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
  return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
  };
+ #endif

  int idx = 0;

@@ -4505,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor->n_dims == 2);
  quantize &= params->quantize_output_tensor || name != "output.weight";
- quantize &= quantized_type != tensor->type;
+ quantize &= !params->only_copy;

  enum ggml_type new_type;
  void * new_data;
  size_t new_size;

- if (!quantize) {
- new_type = tensor->type;
- new_data = tensor->data;
- new_size = ggml_nbytes(tensor);
- LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
- } else {
+ if (quantize) {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
  // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4524,8 +4845,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
  int nx = tensor->ne[0];
- int ny = tensor->ne[1];
- if (nx % QK_K == 0 && ny % QK_K == 0) {
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (new_type != GGML_TYPE_Q8_0) {
  new_type = GGML_TYPE_Q6_K;
  }
  } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4862,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
  else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
  (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+ if (model.type == MODEL_70B) {
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+ }
  ++i_attention_wv;
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+ : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+ if (model.arch == LLM_ARCH_FALCON) {
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ } else {
+ if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+ new_type = GGML_TYPE_Q5_K;
  }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
  ++i_feed_forward_w2;
  } else if (name.find("attn_output.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ if (model.arch != LLM_ARCH_FALCON) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
  }
  else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4919,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
  int nx = tensor->ne[0];
  int ny = tensor->ne[1];
- if (nx % QK_K != 0 || ny % QK_K != 0) {
- LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ if (nx % QK_K != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
  convert_incompatible_tensor = true;
  }
  }
@@ -4585,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  }
  #endif
-
+ // If we've decided to quantize to the same type the tensor is already
+ // in then there's nothing to do.
+ quantize = tensor->type != new_type;
+ }
+ if (!quantize) {
+ new_type = tensor->type;
+ new_data = tensor->data;
+ new_size = ggml_nbytes(tensor);
+ LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+ } else {
  const size_t nelements = ggml_nelements(tensor);

  float * f32_data;
@@ -4990,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
  /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
- /*.gpu_layers =*/ 0,
+ /*.n_gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
  /*.rope_freq_base =*/ 10000.0f,
@@ -4998,7 +5358,7 @@ struct llama_context_params llama_context_default_params() {
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
- /*.mul_mat_q =*/ false,
+ /*.mul_mat_q =*/ true,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
@@ -5007,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
  /*.embedding =*/ false,
  };

+ #ifdef GGML_USE_METAL
+ result.n_gpu_layers = 1;
+ #endif
+
  return result;
  }

@@ -5016,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
  /*.allow_requantize =*/ false,
  /*.quantize_output_tensor =*/ true,
+ /*.only_copy =*/ false,
  };

  return result;
@@ -5198,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
  }
  #endif
  }
- }

  #ifdef GGML_USE_METAL
- if (params.n_gpu_layers > 0) {
- // this allocates all Metal resources and memory buffers
+ if (params.n_gpu_layers > 0) {
+ // this allocates all Metal resources and memory buffers

- void * data_ptr = NULL;
- size_t data_size = 0;
+ void * data_ptr = NULL;
+ size_t data_size = 0;

- if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size = ctx->model.mapping->size;
- } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size = ggml_get_mem_size (ctx->model.ctx);
- }
+ if (params.use_mmap) {
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
+ } else {
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
+ }

- const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

  #define LLAMA_METAL_CHECK_BUF(result) \
- if (!(result)) { \
- LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
- llama_free(ctx); \
- return NULL; \
- }
+ if (!(result)) { \
+ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+ llama_free(ctx); \
+ return NULL; \
+ }

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
  #undef LLAMA_METAL_CHECK_BUF
- }
+ }
  #endif
+ }

  #ifdef GGML_USE_MPI
  ctx->ctx_mpi = ggml_mpi_init();
@@ -5297,13 +5662,29 @@ int llama_model_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }

- int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+ int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  model->name.c_str(),
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }

+ uint64_t llama_model_size(const struct llama_model * model) {
+ uint64_t size = 0;
+ for (const auto & it : model->tensors_by_name) {
+ size += ggml_nbytes(it.second);
+ }
+ return size;
+ }
+
+ uint64_t llama_model_n_params(const struct llama_model * model) {
+ uint64_t nparams = 0;
+ for (const auto & it : model->tensors_by_name) {
+ nparams += ggml_nelements(it.second);
+ }
+ return nparams;
+ }
+
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
@@ -5552,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  rng_ss.str(std::string(&rng_buf[0], rng_size));
  rng_ss >> ctx->rng;

- GGML_ASSERT(rng_ss.fail() == false);
+ GGML_ASSERT(!rng_ss.fail());
  }

  // set logits
@@ -5828,8 +6209,7 @@ int llama_tokenize_with_model(
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
- auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+ auto res = llama_tokenize_internal(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
  LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6223,12 @@ int llama_tokenize_with_model(
  return res.size();
  }

- int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- return llama_token_to_str_with_model(&ctx->model, token, buf, length);
+ int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+ return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
  }

- // does not write null-terminator to str
- int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
+ // does not write null-terminator to buf
+ int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
  if (0 <= token && token < llama_model_n_vocab(model)) {
  if (llama_is_normal_token(model->vocab, token)) {
  std::string result = model->vocab.id_to_token[token].text;
@@ -5936,11 +6316,40 @@ const char * llama_print_system_info(void) {
  s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
  s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

  return s.c_str();
  }

+ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+ fprintf(stream, "\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "# Timings #\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+ 1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+ fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+ 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+ fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+ 1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+ fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+ fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+ fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
+ fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
+ fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
+ fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+ fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
+ fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+ 1.0e6 * ctx->n_eval / ctx->t_eval_us);
+ fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+ 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+ fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+ 1.0e6 * ctx->n_sample / ctx->t_sample_us);
+ }
+
  // For internal test use
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
@@ -5951,10 +6360,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
  g_state.log_callback_user_data = user_data;
  }

- #if defined(_MSC_VER) && !defined(vsnprintf)
- #define vsnprintf _vsnprintf
- #endif
-
  static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
  va_list args_copy;
  va_copy(args_copy, args);
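
Note on the most visible API change in this release: the public token-to-text call is renamed from llama_token_to_str to llama_token_to_piece (same arguments; a negative return value still reports the required buffer size). The C++ sketch below is not part of the diff above; it is a hypothetical caller-side helper (the name token_to_piece is ours) showing one way downstream code might adapt to the rename, mirroring the internal llama_token_to_str wrapper visible in the hunk at old line 796.

// Hypothetical adaptation sketch (not part of this diff): convert a token id into
// its text piece via the renamed llama_token_to_piece() API from llama.h.
#include <cassert>
#include <string>
#include <vector>
#include "llama.h"

static std::string token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> buf(8, 0);
    // A negative return value signals that buf was too small; its magnitude is the needed size.
    const int n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
    if (n < 0) {
        buf.resize(-n);
        const int check = llama_token_to_piece(ctx, token, buf.data(), buf.size());
        assert(check == -n); // mirrors the GGML_ASSERT in the internal wrapper shown above
        (void) check;
    } else {
        buf.resize(n);
    }
    return std::string(buf.data(), buf.size());
}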