llama_cpp 0.9.3 → 0.9.5

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -46,7 +46,6 @@
  #endif
  #include <windows.h>
  #include <io.h>
- #include <stdio.h> // for _fseeki64
  #endif

  #include <algorithm>
@@ -91,7 +90,7 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

- #define LLAMA_MAX_NODES 4096
+ #define LLAMA_MAX_NODES 8192

  //
  // logging
@@ -604,6 +603,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
  return LLAMA_ROPE_SCALING_UNSPECIFIED;
  }

+ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return format("unknown type %d", type);
+ }
+ }
+
+ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ replace_all(val, "\\", "\\\\");
+ replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+ }
+
  //
  // ggml helpers
  //
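Note: the two static helpers added above (gguf_data_to_str, gguf_kv_to_str) stringify GGUF key/value data for the loader's logging and for the per-model metadata map introduced later in this diff. For orientation, a similar key/type dump can be produced directly against the public gguf API they build on; the sketch below is illustrative only (it assumes the gguf functions declared in ggml.h of this release, and the file name is a placeholder):

    #include <cstdio>
    #include "ggml.h"

    // list the key and type of every KV pair in a GGUF file (metadata only, no tensor data)
    int main() {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // placeholder path
        if (ctx == NULL) {
            return 1;
        }

        const int n_kv = gguf_get_n_kv(ctx);
        for (int i = 0; i < n_kv; i++) {
            const char * key = gguf_get_key(ctx, i);
            const enum gguf_type type = gguf_get_kv_type(ctx, i);
            printf("kv %3d: %-40s %s\n", i, key, gguf_type_name(type));
        }

        gguf_free(ctx);
        return 0;
    }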
@@ -1059,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  //

  struct llama_state {
+ llama_state() {
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+ #endif
+ }
+
  // We save the log callback globally
  ggml_log_callback log_callback = llama_log_callback_default;
  void * log_callback_user_data = nullptr;
@@ -1221,6 +1280,7 @@ struct llama_kv_cache {
  // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)

  // computed before each graph build
  uint32_t n = 0;
@@ -1322,6 +1382,9 @@ struct llama_model {

  int n_gpu_layers;

+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
  // context
  struct ggml_context * ctx = NULL;

@@ -1442,6 +1505,7 @@ static bool llama_kv_cache_init(

  cache.head = 0;
  cache.size = n_ctx;
+ cache.used = 0;

  cache.cells.clear();
  cache.cells.resize(n_ctx);
@@ -1543,6 +1607,8 @@ static bool llama_kv_cache_find_slot(
  }
  }

+ cache.used += n_tokens;
+
  return true;
  }

@@ -1563,6 +1629,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  cache.cells[i].seq_id.clear();
  }
  cache.head = 0;
+ cache.used = 0;
  }

  static void llama_kv_cache_seq_rm(
@@ -1585,6 +1652,9 @@ static void llama_kv_cache_seq_rm(
  continue;
  }
  if (cache.cells[i].seq_id.empty()) {
+ // keep count of the number of used cells
+ if (cache.cells[i].pos >= 0) cache.used--;
+
  cache.cells[i].pos = -1;
  if (new_head == cache.size) new_head = i;
  }
@@ -1592,7 +1662,7 @@ static void llama_kv_cache_seq_rm(
  }

  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }

  static void llama_kv_cache_seq_cp(
@@ -1618,6 +1688,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id

  for (uint32_t i = 0; i < cache.size; ++i) {
  if (!cache.cells[i].has_seq_id(seq_id)) {
+ if (cache.cells[i].pos >= 0) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1628,7 +1699,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
  }

  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }

  static void llama_kv_cache_seq_shift(
@@ -1649,6 +1720,7 @@ static void llama_kv_cache_seq_shift(
  cache.cells[i].delta += delta;

  if (cache.cells[i].pos < 0) {
+ if (!cache.cells[i].seq_id.empty()) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1780,10 +1852,10 @@ struct llama_model_loader {
  case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  default:
- {
- LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
- ftype = LLAMA_FTYPE_ALL_F32;
- } break;
+ {
+ LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+ ftype = LLAMA_FTYPE_ALL_F32;
+ } break;
  }

  // this is a way to mark that we have "guessed" the file type
@@ -1797,10 +1869,21 @@ struct llama_model_loader {
  }

  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(ctx_gguf, i);
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ : gguf_type_name(type);
+
+ std::string value = gguf_kv_to_str(ctx_gguf, i);
+ const size_t MAX_VALUE_LEN = 40;
+ if (value.size() > MAX_VALUE_LEN) {
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+ }
+ replace_all(value, "\n", "\\n");

- LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
  }

  // print type counts
@@ -2095,6 +2178,17 @@ static void llm_load_hparams(

  auto & hparams = model.hparams;

+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+ enum gguf_type type = gguf_get_kv_type(ctx, i);
+ if (type == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ const char * name = gguf_get_key(ctx, i);
+ const std::string value = gguf_kv_to_str(ctx, i);
+ model.gguf_kv.emplace(name, value);
+ }
+
  // get general kv
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

@@ -2545,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  }

  // general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

  static void llm_load_tensors(
@@ -3375,7 +3469,7 @@ static void llm_build_k_shift(
  struct ggml_cgraph * graph,
  llm_rope_type type,
  int64_t n_ctx,
- int64_t n_rot,
+ int n_rot,
  float freq_base,
  float freq_scale,
  const llm_build_cb & cb) {
@@ -3407,7 +3501,7 @@ static void llm_build_k_shift(
  // we rotate only the first n_rot dimensions
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k,
- n_rot, n_head_kv, n_ctx,
+ n_embd_head, n_head_kv, n_ctx,
  ggml_element_size(kv.k)*n_embd_head,
  ggml_element_size(kv.k)*n_embd_gqa,
  ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
@@ -3605,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
-
  if (max_alibi_bias > 0.0f) {
- // TODO: n_head or n_head_kv
- // TODO: K-shift is likely not working
- // TODO: change to ggml_add
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
- }
+ // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);

- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
+ if (max_alibi_bias > 0.0f) {
+ // TODO: n_head or n_head_kv
+ // TODO: K-shift is likely not working
+ // TODO: change to ggml_add
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);
+ }

- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);
+
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
+ } else {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+ cb(kq, "kq_soft_max_ext", il);
+ }

  // split cached v into n_head heads
  struct ggml_tensor * v =
@@ -4730,92 +4830,34 @@ struct llm_build_context {
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(tmpq, "tmpq", il);
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);

- struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- cb(tmpk, "tmpk", il);
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);

  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);

- // RoPE the first n_rot of q/k, pass the other half, and concat.
- struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- 0
- ));
- cb(qrot, "qrot", il);
-
- struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head_kv,
- 0
- ));
- cb(krot, "krot", il);
-
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
- struct ggml_tensor * qpass = ggml_view_3d(
- ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- ggml_element_size(tmpq) * hparams.n_rot
- );
- cb(qpass, "qpass", il);
-
- struct ggml_tensor * kpass = ggml_view_3d(
- ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
- ggml_element_size(tmpk) * (n_embd_head),
- ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
- ggml_element_size(tmpk) * hparams.n_rot
- );
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * qrotated = ggml_rope_custom(
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(qrotated, "qrotated", il);
-
- struct ggml_tensor * krotated = ggml_rope_custom(
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
  );
- cb(krotated, "krotated", il);
-
- // ggml currently only supports concatenation on dim=2
- // so we need to permute qrot, qpass, concat, then permute back.
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
- cb(qrotated, "qrotated", il);
-
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
- cb(krotated, "krotated", il);
-
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
- cb(qpass, "qpass", il);
-
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
  cb(Qcur, "Qcur", il);

- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
- cb(Q, "Q", il);
-
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
  cb(Kcur, "Kcur", il);

  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

  cur = llm_build_kqv(ctx0, hparams, kv_self,
  model.layers[il].wo, NULL,
- Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5000,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
  { "kq_masked", OFFLOAD_FUNC_KQ },
  { "kq_soft_max", OFFLOAD_FUNC_V },
+ { "kq_soft_max_ext", OFFLOAD_FUNC_V },
  { "v", OFFLOAD_FUNC_V },
  { "kqv", OFFLOAD_FUNC_V },
  { "kqv_merged", OFFLOAD_FUNC_V },
@@ -5443,6 +5486,12 @@ static int llama_decode_internal(
  batch.seq_id = seq_id_arr.data();
  }

+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
+ kv_self.head = 0;
+ }
+
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
  return 1;
  }
@@ -5453,7 +5502,7 @@ static int llama_decode_internal(
  //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
  kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

- //printf("kv_self.n = %d\n", kv_self.n);
+ //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

  ggml_allocr_reset(lctx.alloc);

@@ -5502,18 +5551,8 @@ static int llama_decode_internal(
  n_threads = std::min(4, n_threads);
  }

- // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
- const bool full_offload_supported =
- model.arch == LLM_ARCH_LLAMA ||
- model.arch == LLM_ARCH_BAICHUAN ||
- model.arch == LLM_ARCH_FALCON ||
- model.arch == LLM_ARCH_REFACT ||
- model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER ||
- model.arch == LLM_ARCH_STABLELM;
-
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
- if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+ if (ggml_cpu_has_cublas() && fully_offloaded) {
  n_threads = 1;
  }

@@ -6372,10 +6411,13 @@ struct llama_grammar_candidate {
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  const char * src,
+ size_t n_src,
  llama_partial_utf8 partial_start) {
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
  const char * pos = src;
  std::vector<uint32_t> code_points;
+ // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+ code_points.reserve(n_src + 1);
  uint32_t value = partial_start.value;
  int n_remain = partial_start.n_remain;

@@ -6426,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
  }

+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+ std::string src,
+ llama_partial_utf8 partial_start
+ ) {
+ return decode_utf8(src.c_str(), src.size(), partial_start);
+ }
+
  // returns true iff pos points to the end of one of the definitions of a rule
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
  switch (pos->type) {
@@ -6979,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  // Replace the data in candidates with the new_candidates data
  std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
  candidates->size = new_candidates.size();
+ candidates->sorted = false;

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7075,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -7282,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  const std::string piece = llama_token_to_piece(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece, grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -8527,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(

  #ifdef GGML_USE_METAL
  if (model->n_gpu_layers > 0) {
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
  ctx->ctx_metal = ggml_metal_init(1);
  if (!ctx->ctx_metal) {
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -8666,6 +8714,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
  return model->hparams.rope_freq_scale_train;
  }

+ int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+ const auto & it = model->gguf_kv.find(key);
+ if (it == model->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
+ int llama_model_meta_count(const struct llama_model * model) {
+ return (int)model->gguf_kv.size();
+ }
+
+ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+ }
+
+ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  llama_model_arch_name(model->arch).c_str(),
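The four llama_model_meta_* functions added above form the new public model-metadata API, backed by the gguf_kv map populated in llm_load_hparams. A minimal usage sketch, assuming the llama.h shipped with this release; the model path and buffer sizes are placeholders, and loader entry points may differ slightly between versions:

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_backend_init(false); // numa = false

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == NULL) {
            return 1;
        }

        char key[256];
        char val[256];

        // enumerate every (non-array) GGUF key/value pair stored on the model
        const int n_meta = llama_model_meta_count(model);
        for (int i = 0; i < n_meta; i++) {
            if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) >= 0 &&
                llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
                printf("%s = %s\n", key, val);
            }
        }

        // direct lookup by key; returns -1 when the key is absent
        if (llama_model_meta_val_str(model, "general.architecture", val, sizeof(val)) >= 0) {
            printf("architecture: %s\n", val);
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }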
@@ -8724,8 +8811,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }

+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+ struct llama_kv_cache_view result = {
+ /*.n_cells = */ 0,
+ /*.n_max_seq = */ n_max_seq,
+ /*.token_count = */ 0,
+ /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
+ /*.max_contiguous = */ 0,
+ /*.max_contiguous_idx = */ -1,
+ /*.cells = */ nullptr,
+ /*.cells_sequences = */ nullptr,
+ };
+ return result;
+ }
+
+ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+ if (view->cells != nullptr) {
+ free(view->cells);
+ view->cells = nullptr;
+ }
+ if (view->cells_sequences != nullptr) {
+ free(view->cells_sequences);
+ view->cells_sequences = nullptr;
+ }
+ }
+
+ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+ if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+ view->n_cells = int32_t(ctx->kv_self.size);
+ void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+ view->cells = (struct llama_kv_cache_view_cell *)p;
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+ view->cells_sequences = (llama_seq_id *)p;
+ }
+
+ const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+ llama_kv_cache_view_cell * c_curr = view->cells;
+ llama_seq_id * cs_curr = view->cells_sequences;
+ int32_t used_cells = 0;
+ int32_t token_count = 0;
+ int32_t curr_contig_idx = -1;
+ uint32_t max_contig = 0;
+ int32_t max_contig_idx = -1;
+
+ for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+ const size_t curr_size = kv_cells[i].seq_id.size();
+ token_count += curr_size;
+ c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+ if (curr_size > 0) {
+ if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+ max_contig = i - curr_contig_idx;
+ max_contig_idx = curr_contig_idx;
+ }
+ curr_contig_idx = -1;
+ } else if (curr_contig_idx < 0) {
+ curr_contig_idx = i;
+ }
+
+ int seq_idx = 0;
+ for (const llama_seq_id it : kv_cells[i].seq_id) {
+ if (seq_idx >= view->n_max_seq) {
+ break;
+ }
+ cs_curr[seq_idx] = it;
+ seq_idx++;
+ }
+ if (seq_idx != 0) {
+ used_cells++;
+ }
+ for (; seq_idx < view->n_max_seq; seq_idx++) {
+ cs_curr[seq_idx] = -1;
+ }
+ }
+ if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+ max_contig_idx = curr_contig_idx;
+ max_contig = kv_cells.size() - curr_contig_idx;
+ }
+ view->max_contiguous = max_contig;
+ view->max_contiguous_idx = max_contig_idx;
+ view->token_count = token_count;
+ view->used_cells = used_cells;
+ if (uint32_t(used_cells) != ctx->kv_self.used) {
+ LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+ __func__, ctx->kv_self.used, used_cells);
+ }
+ }
+
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.head;
+ int result = 0;
+
+ for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+ result += ctx->kv_self.cells[i].seq_id.size();
+ }
+
+ return result;
+ }
+
+ int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+ return ctx->kv_self.used;
  }

  void llama_kv_cache_clear(struct llama_context * ctx) {
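The llama_kv_cache_view_* functions and llama_get_kv_cache_used_cells above are new public helpers for inspecting KV-cache occupancy (llama_get_kv_cache_token_count now counts cached tokens rather than returning the head position). A rough usage sketch, assuming a llama_context * ctx that has already processed some batches; the field names follow the initializer in llama_kv_cache_view_init:

    #include <cstdio>
    #include "llama.h"

    void dump_kv_cache_stats(struct llama_context * ctx) {
        // record at most 4 sequence ids per cell in the snapshot
        struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4);

        // refresh the snapshot; may be called repeatedly on the same view between decodes
        llama_kv_cache_view_update(ctx, &view);

        printf("cells: %d, used: %d, tokens: %d, longest empty run: %d (starting at %d)\n",
                view.n_cells, view.used_cells, view.token_count,
                view.max_contiguous, view.max_contiguous_idx);

        // the scalar counters are also available without building a full view
        printf("used cells: %d, cached tokens: %d\n",
                llama_get_kv_cache_used_cells(ctx),
                llama_get_kv_cache_token_count(ctx));

        llama_kv_cache_view_free(&view);
    }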
@@ -8895,10 +9081,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const size_t kv_buf_size = kv_self.buf.size;
  const uint32_t kv_head = kv_self.head;
  const uint32_t kv_size = kv_self.size;
+ const uint32_t kv_used = kv_self.used;

  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
  data_ctx->write(&kv_head, sizeof(kv_head));
  data_ctx->write(&kv_size, sizeof(kv_size));
+ data_ctx->write(&kv_used, sizeof(kv_used));

  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
@@ -9021,10 +9209,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  size_t kv_buf_size;
  uint32_t kv_head;
  uint32_t kv_size;
+ uint32_t kv_used;

  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

  if (kv_buf_size) {
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
@@ -9059,6 +9249,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ctx->kv_self.head = kv_head;
  ctx->kv_self.size = kv_size;
+ ctx->kv_self.used = kv_used;

  ctx->kv_self.cells.resize(kv_size);

@@ -9521,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
  void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
  g_state.log_callback_user_data = user_data;
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #endif
  }

  static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
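With the change above, a callback installed through llama_log_set is now also forwarded to the Metal backend (and the llama_state constructor registers it at startup). A small sketch of installing a custom sink; the ggml_log_callback signature (level, text, user_data) and the GGML_LOG_LEVEL_* constants are taken from the ggml.h of this era and should be treated as assumptions:

    #include <cstdio>
    #include "llama.h"

    // route llama.cpp (and, on Metal builds, ggml-metal) log output to stderr with a level tag
    static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        const char * tag = (level == GGML_LOG_LEVEL_ERROR) ? "ERR" :
                           (level == GGML_LOG_LEVEL_WARN)  ? "WRN" : "INF";
        fprintf(stderr, "[%s] %s", tag, text);
    }

    int main() {
        // install before loading a model so backend-init messages are captured too
        llama_log_set(my_log_callback, nullptr);
        // ... load a model and create a context as usual ...
        return 0;
    }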