llama_cpp 0.4.0 → 0.5.1

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,9 +1,6 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
- #include <cstddef>
- #include <cstdint>
- #include <cstdio>
  #endif

  #include "llama.h"
@@ -62,6 +59,9 @@
  #include <cinttypes>
  #include <climits>
  #include <cstdarg>
+ #include <cstddef>
+ #include <cstdint>
+ #include <cstdio>
  #include <cstring>
  #include <ctime>
  #include <fstream>
@@ -114,13 +114,21 @@ static size_t utf8_len(char src) {
  }

  void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- for (size_t pos = 0; ; pos += replace.length()) {
- pos = s.find(search, pos);
- if (pos == std::string::npos) break;
- s.erase(pos, search.length());
- s.insert(pos, replace);
+ std::string result;
+ for (size_t pos = 0; ; pos += search.length()) {
+ auto new_pos = s.find(search, pos);
+ if (new_pos == std::string::npos) {
+ result += s.substr(pos, s.size() - pos);
+ break;
+ }
+ result += s.substr(pos, new_pos - pos) + replace;
+ pos = new_pos;
  }
+ s = std::move(result);
  }
+ #ifdef GGML_USE_CPU_HBM
+ #include <hbwmalloc.h>
+ #endif

  static void zeros(std::ofstream & file, size_t n) {
  char zero = 0;
@@ -320,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_GPT2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_GPTJ,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_GPTNEOX,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_MPT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
+ {
+ LLM_ARCH_UNKNOWN,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ },
+ },
  };

  static llm_arch llm_arch_from_string(const std::string & name) {
@@ -407,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  #elif GGML_USE_METAL
  # define llama_host_malloc(n) ggml_metal_host_malloc(n)
  # define llama_host_free(data) ggml_metal_host_free(data)
+ #elif GGML_USE_CPU_HBM
+ # define llama_host_malloc(n) hbw_malloc(n)
+ # define llama_host_free(data) if (data != NULL) hbw_free(data)
  #else
  # define llama_host_malloc(n) malloc(n)
  # define llama_host_free(data) free(data)
@@ -563,16 +612,16 @@ struct llama_mmap {

  if (prefetch > 0) {
  // Advise the kernel to preload the mapped memory
- if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
  strerror(errno));
  }
  }
  if (numa) {
  // advise the kernel not to use readahead
  // (because the next page might not belong on the same node)
- if (madvise(addr, file->size, MADV_RANDOM)) {
- fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+ if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
  strerror(errno));
  }
  }
@@ -609,7 +658,9 @@ struct llama_mmap {
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
  if (prefetch) {
  // Advise the kernel to preload the mapped memory
+
  WIN32_MEMORY_RANGE_ENTRY range;
+
  range.VirtualAddress = addr;
  range.NumberOfBytes = (SIZE_T)size;
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -796,12 +847,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

- static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_str(ctx, token, result.data(), result.size());
+ int check = llama_token_to_piece(ctx, token, result.data(), result.size());
  GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
@@ -955,10 +1006,10 @@ struct llama_vocab {
  id linefeed_id = 13;

  int find_bpe_rank(std::string token_left, std::string token_right) const {
- replace_all(token_left, " ", "Ġ");
- replace_all(token_left, "\n", "Ċ");
- replace_all(token_right, " ", "Ġ");
- replace_all(token_right, "\n", "Ċ");
+ replace_all(token_left, " ", "\u0120");
+ replace_all(token_left, "\n", "\u010A");
+ replace_all(token_right, " ", "\u0120");
+ replace_all(token_right, "\n", "\u010A");

  auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  if (it == bpe_ranks.end()) {
@@ -1144,11 +1195,13 @@ static bool llama_kv_cache_init(

  enum llama_fver {
  GGUF_FILE_VERSION_V1 = 1,
+ GGUF_FILE_VERSION_V2 = 2,
  };

  static const char * llama_file_version_name(llama_fver version) {
  switch (version) {
- case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
+ case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+ case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
  }

  return "unknown";
@@ -1439,7 +1492,11 @@ struct llama_model_loader {
  // allocate temp buffer if not using mmap
  if (!use_mmap && cur->data == NULL) {
  GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
- cur->data = malloc(ggml_nbytes(cur));
+ #ifdef GGML_USE_CPU_HBM
+ cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+ #else
+ cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+ #endif
  }

  load_data_for(cur);
@@ -1593,9 +1650,13 @@ static void llm_load_hparams(

  GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));

- if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ }
  }
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+ // gpt-j n_rot = rotary_dim
  }

  // arch-specific KVs
@@ -1635,7 +1696,8 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
  llama_model_loader & ml,
@@ -1737,7 +1799,11 @@ static void llm_load_vocab(
  }

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+ if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } else {
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+ }

  // special tokens
  GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -2635,18 +2701,20 @@ static struct ggml_cgraph * llm_build_falcon(

  const size_t wsize = ggml_type_size(cur->type);

- struct ggml_tensor * tmpq = ggml_view_3d(
+ // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+ // non-contiguous views is added for the rope operator
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- 0);
+ 0));
  offload_func_kq(tmpq);

- struct ggml_tensor * tmpk = ggml_view_3d(
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
  ctx0, cur, n_embd_head, n_head_kv, N,
  wsize * n_embd_head,
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * n_head);
+ wsize * n_embd_head * n_head));
  offload_func_kq(tmpk);

  struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2899,6 @@ static bool llama_eval_internal(

  GGML_ASSERT(n_tokens > 0);
  GGML_ASSERT(n_past >= 0);
- GGML_ASSERT(n_threads > 0);
  // TODO: keep the values of n_batch and n_ctx
  // GGML_ASSERT(n_tokens <= n_batch);
  // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2909,8 @@ static bool llama_eval_internal(
  ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
  #endif

+ GGML_ASSERT(n_threads > 0);
+
  const int N = n_tokens;

  const auto & model = lctx.model;
@@ -2880,7 +2949,12 @@ static bool llama_eval_internal(

  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+ // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+ // with the BLAS calls. need a better solution
+ if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+ n_threads = std::min(4, n_threads);
+ }

  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -2985,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
  }

- static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
- }
-
- static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
- }
-
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
  }

- static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(llama_is_control_token(vocab, id));
- return id == vocab.special_bos_id;
- }
-
- static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
- GGML_ASSERT(llama_is_control_token(vocab, id));
- return id == vocab.special_eos_id;
- }
-
- static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
- GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
- return id == vocab.special_pad_id;
- }
-
  static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
  GGML_ASSERT(llama_is_byte_token(vocab, id));
  const auto& token_data = vocab.id_to_token.at(id);
@@ -3026,16 +3077,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
  return vocab.token_to_id.at(buf);
  }

- static std::string llama_escape_whitespace(const std::string& text) {
- std::string result = "\xe2\x96\x81";
- for (size_t offs = 0; offs < text.length(); ++offs) {
- if (text[offs] == ' ') {
- result += "\xe2\x96\x81";
- } else {
- result += text[offs];
- }
- }
- return result;
+ static void llama_escape_whitespace(std::string & text) {
+ replace_all(text, " ", "\xe2\x96\x81");
  }

  static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3247,7 @@ private:

  struct llm_bigram_bpe {
  struct comparator {
- bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+ bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
  return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
  }
  };
@@ -3219,7 +3262,7 @@ struct llm_bigram_bpe {
  };

  struct llm_tokenizer_bpe {
- llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  int final_prev_index = -1;
@@ -3312,9 +3355,15 @@ struct llm_tokenizer_bpe {
  std::string byte_str(1, *j);
  auto token_multibyte = vocab.token_to_id.find(byte_str);
  if (token_multibyte == vocab.token_to_id.end()) {
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ try {
+ llama_token token_byte = llama_byte_to_token(vocab, *j);
+ output.push_back(token_byte);
+ } catch (const std::out_of_range & err) {
+ fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ }
+ } else {
+ output.push_back((*token_multibyte).second);
  }
- output.push_back((*token_multibyte).second);
  }
  } else {
  output.push_back((*token).second);
@@ -3352,26 +3401,23 @@ private:
  }

  // probably not 100% correct
- // TODO: this is quite slow - how to make it more efficient?
- static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
  std::vector<std::string> words;

  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
  const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
  const std::regex re(pattern);
- std::smatch m;

- while (std::regex_search(text, m, re)) {
- for (auto x : m) {
- words.push_back(x);
- }
- text = m.suffix();
+ auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+ auto words_end = std::sregex_iterator();
+ auto n_words = std::distance(words_begin, words_end);
+ words.reserve(n_words);
+ for (auto it = words_begin; it != words_end; ++it) {
+ words.push_back(it->str());
  }
-
  return words;
- }

- bool flag_g2ws = false;
+ }

  const llama_vocab & vocab;

@@ -3381,9 +3427,18 @@ private:
  llm_bigram_bpe::queue work_queue;
  };

- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
  std::vector<llama_vocab::id> output;

+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_bos=True) returns [1]
+ // tokenizer.encode('', add_bos=False) returns []
+
+ if (bos && vocab.special_bos_id != -1) {
+ output.push_back(vocab.special_bos_id);
+ }
+
  if (raw_text.empty()) {
  return output;
  }
@@ -3391,29 +3446,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
- llm_tokenizer_spm tokenizer(vocab);
-
- if (bos) {
- output.push_back(vocab.special_bos_id);
- }
+ // without adding this leading whitespace, we do not get the same results as the original tokenizer
+ raw_text = " " + raw_text;

- std::string text;
- if (escape) {
- text = llama_escape_whitespace(raw_text);
- } else {
- text = raw_text;
- }
-
- tokenizer.tokenize(text, output);
+ llm_tokenizer_spm tokenizer(vocab);
+ llama_escape_whitespace(raw_text);
+ tokenizer.tokenize(raw_text, output);
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- llm_tokenizer_bpe tokenizer(vocab, escape);
-
- if (bos && vocab.special_bos_id != -1) {
- output.push_back(vocab.special_bos_id);
- }
-
+ llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } break;
  };
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

  if (stack.empty()) {
- new_stacks.push_back(stack);
+ new_stacks.emplace_back(stack);
  return;
  }

@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
- new_stacks.push_back(stack);
+ new_stacks.emplace_back(stack);
  break;
  default:
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
  delete grammar;
  }

+ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+ llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+ // redirect elements in stacks to point to new rules
+ for (size_t is = 0; is < result->stacks.size(); is++) {
+ for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+ for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+ for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+ if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+ result->stacks[is][ie] = &result->rules[ir0][ir1];
+ }
+ }
+ }
+ }
+ }
+
+ return result;
+ }
+
  //
  // sampling
  //
@@ -3908,7 +3969,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *

  // Calculate absolute value of second derivatives
  for (size_t i = 0; i < second_derivatives.size(); ++i) {
- second_derivatives[i] = abs(second_derivatives[i]);
+ second_derivatives[i] = std::abs(second_derivatives[i]);
  }

  // Normalize the second derivatives
@@ -4099,16 +4160,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  std::vector<llama_grammar_candidate> candidates_grammar;

  for (size_t i = 0; i < candidates->size; ++i) {
- const llama_token id = candidates->data[i].id;
- const std::string text = llama_token_to_text(ctx, id);
+ const llama_token id = candidates->data[i].id;
+ const std::string piece = llama_token_to_str(ctx, id);
  if (id == eos) {
  if (!allow_eos) {
  candidates->data[i].logit = -INFINITY;
  }
- } else if (text.empty() || text[0] == 0) {
+ } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -4312,10 +4373,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }

- const std::string text = llama_token_to_text(ctx, token);
+ const std::string piece = llama_token_to_str(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4387,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }

+ //
+ // Beam search
+ //
+
+ struct llama_beam {
+ std::vector<llama_token> tokens;
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+ // Sort beams by probability. In case of ties, prefer beams at eob.
+ bool operator<(const llama_beam & rhs) const {
+ return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+ }
+ // Shift off first n tokens and discard them.
+ void shift_tokens(const size_t n) {
+ if (n) {
+ std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+ tokens.resize(tokens.size() - n);
+ }
+ }
+ llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+ };
+
+ // A struct for calculating logit-related info.
+ struct llama_logit_info {
+ const float * const logits;
+ const int n_vocab;
+ const float max_l;
+ const float normalizer;
+ struct sum_exp {
+ float max_l;
+ float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+ };
+ llama_logit_info(llama_context * ctx)
+ : logits(llama_get_logits(ctx))
+ , n_vocab(llama_n_vocab(ctx))
+ , max_l(*std::max_element(logits, logits + n_vocab))
+ , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+ { }
+ llama_token_data get_token_data(const llama_token token_id) const {
+ constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+ return {token_id, logits[token_id], p};
+ }
+ // Return top k token_data by logit.
+ std::vector<llama_token_data> top_k(size_t k) {
+ std::vector<llama_token_data> min_heap; // min-heap by logit
+ const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+ min_heap.reserve(k_min);
+ for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+ min_heap.push_back(get_token_data(token_id));
+ }
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+ std::make_heap(min_heap.begin(), min_heap.end(), comp);
+ for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+ if (min_heap.front().logit < logits[token_id]) {
+ std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+ min_heap.back().id = token_id;
+ min_heap.back().logit = logits[token_id];
+ std::push_heap(min_heap.begin(), min_heap.end(), comp);
+ }
+ }
+ return min_heap;
+ }
+ float probability_from_logit(float logit) const {
+ return normalizer * std::exp(logit - max_l);
+ }
+ };
+
+ struct llama_beam_search_data {
+ llama_context * ctx;
+ size_t n_beams;
+ int n_past;
+ int n_predict;
+ int n_threads;
+ std::vector<llama_beam> beams;
+ std::vector<llama_beam> next_beams;
+
+ // Re-calculated on each loop iteration
+ size_t common_prefix_length;
+
+ // Used to communicate to/from callback on beams state.
+ std::vector<llama_beam_view> beam_views;
+
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+ : ctx(ctx)
+ , n_beams(n_beams)
+ , n_past(n_past)
+ , n_predict(n_predict)
+ , n_threads(n_threads)
+ , beam_views(n_beams) {
+ beams.reserve(n_beams);
+ next_beams.reserve(n_beams);
+ }
+
+ // Collapse beams to a single beam given by index.
+ void collapse_beams(const size_t beam_idx) {
+ if (0u < beam_idx) {
+ std::swap(beams[0], beams[beam_idx]);
+ }
+ beams.resize(1);
+ }
+
+ // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+ // The repetative patterns below reflect the 2 stages of heaps:
+ // * Gather elements until the vector is full, then call std::make_heap() on it.
+ // * If the heap is full and a new element is found that should be included, pop the
+ // least element to the back(), replace it with the new, then push it into the heap.
+ void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+ // Min-heaps use a greater-than comparator.
+ const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+ if (beam.eob) {
+ // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+ if (next_beams.size() < n_beams) {
+ next_beams.push_back(std::move(beam));
+ if (next_beams.size() == n_beams) {
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else if (next_beams.front().p < beam.p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = std::move(beam);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ } else {
+ // beam is not at end-of-sentence, so branch with next top_k tokens.
+ if (!beam.tokens.empty()) {
+ llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+ }
+ llama_logit_info logit_info(ctx);
+ std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+ size_t i=0;
+ if (next_beams.size() < n_beams) {
+ for (; next_beams.size() < n_beams ; ++i) {
+ llama_beam next_beam = beam;
+ next_beam.tokens.push_back(next_tokens[i].id);
+ next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ next_beams.push_back(std::move(next_beam));
+ }
+ std::make_heap(next_beams.begin(), next_beams.end(), comp);
+ } else {
+ for (; next_beams.front().p == 0.0f ; ++i) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ for (; i < n_beams ; ++i) {
+ const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+ if (next_beams.front().p < next_p) {
+ std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+ next_beams.back() = beam;
+ next_beams.back().tokens.push_back(next_tokens[i].id);
+ next_beams.back().p = next_p;
+ std::push_heap(next_beams.begin(), next_beams.end(), comp);
+ }
+ }
+ }
+ }
+
+ // Find common_prefix_length based on beams.
+ // Requires beams is not empty.
+ size_t find_common_prefix_length() {
+ size_t common_prefix_length = beams[0].tokens.size();
+ for (size_t i = 1 ; i < beams.size() ; ++i) {
+ common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+ for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+ if (beams[0].tokens[j] != beams[i].tokens[j]) {
+ common_prefix_length = j;
+ break;
+ }
+ }
+ }
+ return common_prefix_length;
+ }
+
+ // Construct beams_state to send back to caller via the callback function.
+ // Side effect: set common_prefix_length = find_common_prefix_length();
+ llama_beams_state get_beams_state(const bool last_call) {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beam_views[i] = beams[i].view();
+ }
+ common_prefix_length = find_common_prefix_length();
+ return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+ }
+
+ // Loop:
+ // * while i < n_predict, AND
+ // * any of the beams have not yet reached end-of-beam (eob), AND
+ // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+ // (since all other beam probabilities can only decrease)
+ void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+ beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
+ const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+ for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+ !beams[top_beam_index()].eob ; ++i) {
+ callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
+ update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
+ if (common_prefix_length) {
+ llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+ n_past += common_prefix_length;
+ }
+ // Zero-out next_beam probabilities to place them last in following min-heap.
+ std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+ for (llama_beam & beam : beams) {
+ beam.shift_tokens(common_prefix_length);
+ fill_next_beams_by_top_probabilities(beam);
+ }
+ // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+ beams.swap(next_beams);
+ renormalize_beam_probabilities(beams);
+ }
+ collapse_beams(top_beam_index());
+ callback(callback_data, get_beams_state(true));
+ }
+
+ // As beams grow, the cumulative probabilities decrease.
+ // Renormalize them to avoid floating point underflow.
+ static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+ const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+ const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+ std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+ }
+
+ // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+ size_t top_beam_index() {
+ return std::max_element(beams.begin(), beams.end()) - beams.begin();
+ }
+
+ // Copy (p,eob) for each beam which may have been changed by the callback.
+ void update_beams_from_beam_views() {
+ for (size_t i = 0 ; i < beams.size() ; ++i) {
+ beams[i].p = beam_views[i].p;
+ beams[i].eob = beam_views[i].eob;
+ }
+ }
+ };
+
+ void llama_beam_search(llama_context * ctx,
+ llama_beam_search_callback_fn_t callback, void * callback_data,
+ size_t n_beams, int n_past, int n_predict, int n_threads) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+ beam_search_data.loop(callback, callback_data);
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+
  //
  // quantization
  //
@@ -4423,6 +4735,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));

+ llama_model model;
+ llm_load_arch(*ml, model);
+ llm_load_hparams(*ml, model, 0, 0, 0);
+
+ if (params->only_copy) {
+ ftype = model.ftype;
+ }
+
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
  struct gguf_context * ctx_out = gguf_init_empty();

@@ -4448,6 +4768,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  ++n_feed_forward_w2;
  }
  }
+ if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+ __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+ }

  int i_attention_wv = 0;
  int i_feed_forward_w2 = 0;
@@ -4460,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<std::thread> workers;
  std::mutex mutex;

+ #ifdef GGML_USE_K_QUANTS
  auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
  return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
  };
+ #endif

  int idx = 0;

@@ -4505,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor->n_dims == 2);
  quantize &= params->quantize_output_tensor || name != "output.weight";
- quantize &= quantized_type != tensor->type;
+ quantize &= !params->only_copy;

  enum ggml_type new_type;
  void * new_data;
  size_t new_size;

- if (!quantize) {
- new_type = tensor->type;
- new_data = tensor->data;
- new_size = ggml_nbytes(tensor);
- LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
- } else {
+ if (quantize) {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
  // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4524,8 +4845,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
  int nx = tensor->ne[0];
- int ny = tensor->ne[1];
- if (nx % QK_K == 0 && ny % QK_K == 0) {
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (new_type != GGML_TYPE_Q8_0) {
  new_type = GGML_TYPE_Q6_K;
  }
  } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4862,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
  else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
  (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+ if (model.type == MODEL_70B) {
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+ }
  ++i_attention_wv;
  } else if (name.find("ffn_down.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+ : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+ if (model.arch == LLM_ARCH_FALCON) {
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ } else {
+ if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+ new_type = GGML_TYPE_Q5_K;
  }
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
  ++i_feed_forward_w2;
  } else if (name.find("attn_output.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ if (model.arch != LLM_ARCH_FALCON) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
  }
  else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4919,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
  int nx = tensor->ne[0];
  int ny = tensor->ne[1];
- if (nx % QK_K != 0 || ny % QK_K != 0) {
- LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ if (nx % QK_K != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
  convert_incompatible_tensor = true;
  }
  }
@@ -4585,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  }
  #endif
-
+ // If we've decided to quantize to the same type the tensor is already
+ // in then there's nothing to do.
+ quantize = tensor->type != new_type;
+ }
+ if (!quantize) {
+ new_type = tensor->type;
+ new_data = tensor->data;
+ new_size = ggml_nbytes(tensor);
+ LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+ } else {
  const size_t nelements = ggml_nelements(tensor);

  float * f32_data;
@@ -4990,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
  /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
- /*.gpu_layers =*/ 0,
+ /*.n_gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
  /*.rope_freq_base =*/ 10000.0f,
@@ -4998,7 +5358,7 @@ struct llama_context_params llama_context_default_params() {
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
- /*.mul_mat_q =*/ false,
+ /*.mul_mat_q =*/ true,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
@@ -5007,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
  /*.embedding =*/ false,
  };

+ #ifdef GGML_USE_METAL
+ result.n_gpu_layers = 1;
+ #endif
+
  return result;
  }

@@ -5016,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
  /*.allow_requantize =*/ false,
  /*.quantize_output_tensor =*/ true,
+ /*.only_copy =*/ false,
  };

  return result;
@@ -5198,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
  }
  #endif
  }
- }

  #ifdef GGML_USE_METAL
- if (params.n_gpu_layers > 0) {
- // this allocates all Metal resources and memory buffers
+ if (params.n_gpu_layers > 0) {
+ // this allocates all Metal resources and memory buffers

- void * data_ptr = NULL;
- size_t data_size = 0;
+ void * data_ptr = NULL;
+ size_t data_size = 0;

- if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size = ctx->model.mapping->size;
- } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size = ggml_get_mem_size (ctx->model.ctx);
- }
+ if (params.use_mmap) {
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
+ } else {
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
+ }

- const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

  #define LLAMA_METAL_CHECK_BUF(result) \
- if (!(result)) { \
- LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
- llama_free(ctx); \
- return NULL; \
- }
+ if (!(result)) { \
+ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+ llama_free(ctx); \
+ return NULL; \
+ }

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
  #undef LLAMA_METAL_CHECK_BUF
- }
+ }
  #endif
+ }

  #ifdef GGML_USE_MPI
  ctx->ctx_mpi = ggml_mpi_init();
@@ -5297,13 +5662,29 @@ int llama_model_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }

- int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+ int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  model->name.c_str(),
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }

+ uint64_t llama_model_size(const struct llama_model * model) {
+ uint64_t size = 0;
+ for (const auto & it : model->tensors_by_name) {
+ size += ggml_nbytes(it.second);
+ }
+ return size;
+ }
+
+ uint64_t llama_model_n_params(const struct llama_model * model) {
+ uint64_t nparams = 0;
+ for (const auto & it : model->tensors_by_name) {
+ nparams += ggml_nelements(it.second);
+ }
+ return nparams;
+ }
+
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
@@ -5552,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  rng_ss.str(std::string(&rng_buf[0], rng_size));
  rng_ss >> ctx->rng;

- GGML_ASSERT(rng_ss.fail() == false);
+ GGML_ASSERT(!rng_ss.fail());
  }

  // set logits
@@ -5828,8 +6209,7 @@ int llama_tokenize_with_model(
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
- auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+ auto res = llama_tokenize_internal(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
  LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6223,12 @@ int llama_tokenize_with_model(
  return res.size();
  }

- int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- return llama_token_to_str_with_model(&ctx->model, token, buf, length);
+ int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+ return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
  }

- // does not write null-terminator to str
- int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
+ // does not write null-terminator to buf
+ int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
  if (0 <= token && token < llama_model_n_vocab(model)) {
  if (llama_is_normal_token(model->vocab, token)) {
  std::string result = model->vocab.id_to_token[token].text;
@@ -5936,11 +6316,40 @@ const char * llama_print_system_info(void) {
  s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
  s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

  return s.c_str();
  }

+ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+ fprintf(stream, "\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "# Timings #\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+ 1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+ fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+ 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+ fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+ 1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+ fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+ fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+ fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
+ fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
+ fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
+ fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+ fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
+ fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+ 1.0e6 * ctx->n_eval / ctx->t_eval_us);
+ fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+ 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+ fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+ 1.0e6 * ctx->n_sample / ctx->t_sample_us);
+ }
+
  // For internal test use
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
@@ -5951,10 +6360,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
  g_state.log_callback_user_data = user_data;
  }

- #if defined(_MSC_VER) && !defined(vsnprintf)
- #define vsnprintf _vsnprintf
- #endif
-
  static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
  va_list args_copy;
  va_copy(args_copy, args);