llama_cpp 0.9.3 → 0.9.5

@@ -46,7 +46,6 @@
  #endif
  #include <windows.h>
  #include <io.h>
- #include <stdio.h> // for _fseeki64
  #endif

  #include <algorithm>
@@ -91,7 +90,7 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

- #define LLAMA_MAX_NODES 4096
+ #define LLAMA_MAX_NODES 8192

  //
  // logging
@@ -604,6 +603,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
  return LLAMA_ROPE_SCALING_UNSPECIFIED;
  }

+ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return format("unknown type %d", type);
+ }
+ }
+
+ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ replace_all(val, "\\", "\\\\");
+ replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+ }
+
  //
  // ggml helpers
  //
@@ -1059,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  //

  struct llama_state {
+ llama_state() {
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+ #endif
+ }
+
  // We save the log callback globally
  ggml_log_callback log_callback = llama_log_callback_default;
  void * log_callback_user_data = nullptr;
@@ -1221,6 +1280,7 @@ struct llama_kv_cache {
  // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)

  // computed before each graph build
  uint32_t n = 0;
@@ -1322,6 +1382,9 @@ struct llama_model {

  int n_gpu_layers;

+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
  // context
  struct ggml_context * ctx = NULL;

@@ -1442,6 +1505,7 @@ static bool llama_kv_cache_init(

  cache.head = 0;
  cache.size = n_ctx;
+ cache.used = 0;

  cache.cells.clear();
  cache.cells.resize(n_ctx);
@@ -1543,6 +1607,8 @@ static bool llama_kv_cache_find_slot(
  }
  }

+ cache.used += n_tokens;
+
  return true;
  }

@@ -1563,6 +1629,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  cache.cells[i].seq_id.clear();
  }
  cache.head = 0;
+ cache.used = 0;
  }

  static void llama_kv_cache_seq_rm(
@@ -1585,6 +1652,9 @@ static void llama_kv_cache_seq_rm(
  continue;
  }
  if (cache.cells[i].seq_id.empty()) {
+ // keep count of the number of used cells
+ if (cache.cells[i].pos >= 0) cache.used--;
+
  cache.cells[i].pos = -1;
  if (new_head == cache.size) new_head = i;
  }
@@ -1592,7 +1662,7 @@ static void llama_kv_cache_seq_rm(
  }

  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }

  static void llama_kv_cache_seq_cp(
@@ -1618,6 +1688,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id

  for (uint32_t i = 0; i < cache.size; ++i) {
  if (!cache.cells[i].has_seq_id(seq_id)) {
+ if (cache.cells[i].pos >= 0) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1628,7 +1699,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
  }

  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }

  static void llama_kv_cache_seq_shift(
@@ -1649,6 +1720,7 @@ static void llama_kv_cache_seq_shift(
  cache.cells[i].delta += delta;

  if (cache.cells[i].pos < 0) {
+ if (!cache.cells[i].seq_id.empty()) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1780,10 +1852,10 @@ struct llama_model_loader {
  case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  default:
- {
- LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
- ftype = LLAMA_FTYPE_ALL_F32;
- } break;
+ {
+ LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+ ftype = LLAMA_FTYPE_ALL_F32;
+ } break;
  }

  // this is a way to mark that we have "guessed" the file type
@@ -1797,10 +1869,21 @@ struct llama_model_loader {
  }

  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(ctx_gguf, i);
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ : gguf_type_name(type);
+
+ std::string value = gguf_kv_to_str(ctx_gguf, i);
+ const size_t MAX_VALUE_LEN = 40;
+ if (value.size() > MAX_VALUE_LEN) {
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+ }
+ replace_all(value, "\n", "\\n");

- LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
  }

  // print type counts
@@ -2095,6 +2178,17 @@ static void llm_load_hparams(

  auto & hparams = model.hparams;

+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+ enum gguf_type type = gguf_get_kv_type(ctx, i);
+ if (type == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ const char * name = gguf_get_key(ctx, i);
+ const std::string value = gguf_kv_to_str(ctx, i);
+ model.gguf_kv.emplace(name, value);
+ }
+
  // get general kv
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

@@ -2545,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  }

  // general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

  static void llm_load_tensors(
@@ -3375,7 +3469,7 @@ static void llm_build_k_shift(
  struct ggml_cgraph * graph,
  llm_rope_type type,
  int64_t n_ctx,
- int64_t n_rot,
+ int n_rot,
  float freq_base,
  float freq_scale,
  const llm_build_cb & cb) {
@@ -3407,7 +3501,7 @@ static void llm_build_k_shift(
  // we rotate only the first n_rot dimensions
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k,
- n_rot, n_head_kv, n_ctx,
+ n_embd_head, n_head_kv, n_ctx,
  ggml_element_size(kv.k)*n_embd_head,
  ggml_element_size(kv.k)*n_embd_gqa,
  ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
@@ -3605,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
-
  if (max_alibi_bias > 0.0f) {
- // TODO: n_head or n_head_kv
- // TODO: K-shift is likely not working
- // TODO: change to ggml_add
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
- }
+ // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);

- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
+ if (max_alibi_bias > 0.0f) {
+ // TODO: n_head or n_head_kv
+ // TODO: K-shift is likely not working
+ // TODO: change to ggml_add
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);
+ }

- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);
+
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
+ } else {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+ cb(kq, "kq_soft_max_ext", il);
+ }

  // split cached v into n_head heads
  struct ggml_tensor * v =
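
Note (not part of the diff): in the new non-ALiBi branch above, ggml_soft_max_ext fuses the previous scale, mask and softmax steps into a single op. Conceptually it computes kq = softmax(kq * 1/sqrt(n_embd_head) + kq_mask), which is the same result the removed kq_scaled / kq_masked / kq_soft_max chain produced in three separate graph nodes.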
@@ -4730,92 +4830,34 @@ struct llm_build_context {
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(tmpq, "tmpq", il);
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);

- struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- cb(tmpk, "tmpk", il);
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);

  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);

- // RoPE the first n_rot of q/k, pass the other half, and concat.
- struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- 0
- ));
- cb(qrot, "qrot", il);
-
- struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head_kv,
- 0
- ));
- cb(krot, "krot", il);
-
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
- struct ggml_tensor * qpass = ggml_view_3d(
- ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- ggml_element_size(tmpq) * hparams.n_rot
- );
- cb(qpass, "qpass", il);
-
- struct ggml_tensor * kpass = ggml_view_3d(
- ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
- ggml_element_size(tmpk) * (n_embd_head),
- ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
- ggml_element_size(tmpk) * hparams.n_rot
- );
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * qrotated = ggml_rope_custom(
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(qrotated, "qrotated", il);
-
- struct ggml_tensor * krotated = ggml_rope_custom(
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
  );
- cb(krotated, "krotated", il);
-
- // ggml currently only supports concatenation on dim=2
- // so we need to permute qrot, qpass, concat, then permute back.
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
- cb(qrotated, "qrotated", il);
-
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
- cb(krotated, "krotated", il);
-
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
- cb(qpass, "qpass", il);
-
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
  cb(Qcur, "Qcur", il);

- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
- cb(Q, "Q", il);
-
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
  cb(Kcur, "Kcur", il);

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

  cur = llm_build_kqv(ctx0, hparams, kv_self,
  model.layers[il].wo, NULL,
- Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5000,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
  { "kq_masked", OFFLOAD_FUNC_KQ },
  { "kq_soft_max", OFFLOAD_FUNC_V },
+ { "kq_soft_max_ext", OFFLOAD_FUNC_V },
  { "v", OFFLOAD_FUNC_V },
  { "kqv", OFFLOAD_FUNC_V },
  { "kqv_merged", OFFLOAD_FUNC_V },
@@ -5443,6 +5486,12 @@ static int llama_decode_internal(
  batch.seq_id = seq_id_arr.data();
  }

+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
+ kv_self.head = 0;
+ }
+
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
  return 1;
  }
@@ -5453,7 +5502,7 @@ static int llama_decode_internal(
  //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
  kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

- //printf("kv_self.n = %d\n", kv_self.n);
+ //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

  ggml_allocr_reset(lctx.alloc);

@@ -5502,18 +5551,8 @@ static int llama_decode_internal(
  n_threads = std::min(4, n_threads);
  }

- // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
- const bool full_offload_supported =
- model.arch == LLM_ARCH_LLAMA ||
- model.arch == LLM_ARCH_BAICHUAN ||
- model.arch == LLM_ARCH_FALCON ||
- model.arch == LLM_ARCH_REFACT ||
- model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER ||
- model.arch == LLM_ARCH_STABLELM;
-
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
- if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+ if (ggml_cpu_has_cublas() && fully_offloaded) {
  n_threads = 1;
  }

@@ -6372,10 +6411,13 @@ struct llama_grammar_candidate {
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  const char * src,
+ size_t n_src,
  llama_partial_utf8 partial_start) {
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
  const char * pos = src;
  std::vector<uint32_t> code_points;
+ // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+ code_points.reserve(n_src + 1);
  uint32_t value = partial_start.value;
  int n_remain = partial_start.n_remain;

@@ -6426,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
  }

+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+ std::string src,
+ llama_partial_utf8 partial_start
+ ) {
+ return decode_utf8(src.c_str(), src.size(), partial_start);
+ }
+
  // returns true iff pos points to the end of one of the definitions of a rule
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
  switch (pos->type) {
@@ -6979,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  // Replace the data in candidates with the new_candidates data
  std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
  candidates->size = new_candidates.size();
+ candidates->sorted = false;

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7075,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -7282,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  const std::string piece = llama_token_to_piece(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece, grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -8527,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(

  #ifdef GGML_USE_METAL
  if (model->n_gpu_layers > 0) {
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
  ctx->ctx_metal = ggml_metal_init(1);
  if (!ctx->ctx_metal) {
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -8666,6 +8714,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
  return model->hparams.rope_freq_scale_train;
  }

+ int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+ const auto & it = model->gguf_kv.find(key);
+ if (it == model->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
+ int llama_model_meta_count(const struct llama_model * model) {
+ return (int)model->gguf_kv.size();
+ }
+
+ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+ }
+
+ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  llama_model_arch_name(model->arch).c_str(),
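
Note (not part of the diff): a minimal usage sketch of the metadata accessors added above, assuming a valid llama_model pointer and illustrative buffer sizes; values longer than the buffer are truncated per snprintf semantics.

// usage sketch for the new llama_model_meta_* accessors (hypothetical helper, not in the diff)
#include <cstdio>   // printf
// #include "llama.h"

static void dump_model_meta(const struct llama_model * model) {
    char key[256];
    char val[256];
    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        // both calls return -1 when the index is out of range and leave an empty string in buf
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }
}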
@@ -8724,8 +8811,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }

+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+ struct llama_kv_cache_view result = {
+ /*.n_cells = */ 0,
+ /*.n_max_seq = */ n_max_seq,
+ /*.token_count = */ 0,
+ /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
+ /*.max_contiguous = */ 0,
+ /*.max_contiguous_idx = */ -1,
+ /*.cells = */ nullptr,
+ /*.cells_sequences = */ nullptr,
+ };
+ return result;
+ }
+
+ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+ if (view->cells != nullptr) {
+ free(view->cells);
+ view->cells = nullptr;
+ }
+ if (view->cells_sequences != nullptr) {
+ free(view->cells_sequences);
+ view->cells_sequences = nullptr;
+ }
+ }
+
+ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+ if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+ view->n_cells = int32_t(ctx->kv_self.size);
+ void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+ view->cells = (struct llama_kv_cache_view_cell *)p;
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+ view->cells_sequences = (llama_seq_id *)p;
+ }
+
+ const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+ llama_kv_cache_view_cell * c_curr = view->cells;
+ llama_seq_id * cs_curr = view->cells_sequences;
+ int32_t used_cells = 0;
+ int32_t token_count = 0;
+ int32_t curr_contig_idx = -1;
+ uint32_t max_contig = 0;
+ int32_t max_contig_idx = -1;
+
+ for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+ const size_t curr_size = kv_cells[i].seq_id.size();
+ token_count += curr_size;
+ c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+ if (curr_size > 0) {
+ if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+ max_contig = i - curr_contig_idx;
+ max_contig_idx = curr_contig_idx;
+ }
+ curr_contig_idx = -1;
+ } else if (curr_contig_idx < 0) {
+ curr_contig_idx = i;
+ }
+
+ int seq_idx = 0;
+ for (const llama_seq_id it : kv_cells[i].seq_id) {
+ if (seq_idx >= view->n_max_seq) {
+ break;
+ }
+ cs_curr[seq_idx] = it;
+ seq_idx++;
+ }
+ if (seq_idx != 0) {
+ used_cells++;
+ }
+ for (; seq_idx < view->n_max_seq; seq_idx++) {
+ cs_curr[seq_idx] = -1;
+ }
+ }
+ if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+ max_contig_idx = curr_contig_idx;
+ max_contig = kv_cells.size() - curr_contig_idx;
+ }
+ view->max_contiguous = max_contig;
+ view->max_contiguous_idx = max_contig_idx;
+ view->token_count = token_count;
+ view->used_cells = used_cells;
+ if (uint32_t(used_cells) != ctx->kv_self.used) {
+ LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+ __func__, ctx->kv_self.used, used_cells);
+ }
+ }
+
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.head;
+ int result = 0;
+
+ for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+ result += ctx->kv_self.cells[i].seq_id.size();
+ }
+
+ return result;
+ }
+
+ int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+ return ctx->kv_self.used;
  }

  void llama_kv_cache_clear(struct llama_context * ctx) {
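
Note (not part of the diff): a minimal usage sketch of the KV cache view API added above, assuming a valid llama_context pointer and a single tracked sequence (n_max_seq = 1).

// usage sketch for the new KV cache introspection API (hypothetical helper, not in the diff)
#include <cstdio>   // printf
// #include "llama.h"

static void print_kv_cache_stats(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 1);
    llama_kv_cache_view_update(ctx, &view);   // allocates/refreshes view.cells and view.cells_sequences
    printf("cells = %d, used cells = %d, tokens = %d, max contiguous empty run = %d (starting at %d)\n",
           view.n_cells, view.used_cells, view.token_count,
           view.max_contiguous, view.max_contiguous_idx);
    llama_kv_cache_view_free(&view);          // releases the buffers allocated by update
}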
@@ -8895,10 +9081,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const size_t kv_buf_size = kv_self.buf.size;
  const uint32_t kv_head = kv_self.head;
  const uint32_t kv_size = kv_self.size;
+ const uint32_t kv_used = kv_self.used;

  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
  data_ctx->write(&kv_head, sizeof(kv_head));
  data_ctx->write(&kv_size, sizeof(kv_size));
+ data_ctx->write(&kv_used, sizeof(kv_used));

  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
@@ -9021,10 +9209,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  size_t kv_buf_size;
  uint32_t kv_head;
  uint32_t kv_size;
+ uint32_t kv_used;

  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

  if (kv_buf_size) {
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
@@ -9059,6 +9249,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ctx->kv_self.head = kv_head;
  ctx->kv_self.size = kv_size;
+ ctx->kv_self.used = kv_used;

  ctx->kv_self.cells.resize(kv_size);

@@ -9521,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
  void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
  g_state.log_callback_user_data = user_data;
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #endif
  }

  static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
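
Note (not part of the diff): with the change above, a callback installed via llama_log_set is also forwarded to the Metal backend on GGML_USE_METAL builds. A minimal sketch of installing a custom logger, assuming the standard llama.h / ggml.h headers:

// usage sketch: route all llama.cpp (and, on Metal builds, Metal backend) log output to stderr
#include <cstdio>   // fputs
// #include "llama.h"

static void my_log_callback(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
}

// during initialization, before creating models/contexts:
//   llama_log_set(my_log_callback, nullptr);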