llama_cpp 0.9.3 → 0.9.4

@@ -91,7 +91,7 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif
 
- #define LLAMA_MAX_NODES 4096
+ #define LLAMA_MAX_NODES 8192
 
  //
  // logging
@@ -604,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
  return LLAMA_ROPE_SCALING_UNSPECIFIED;
  }
 
+ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return format("unknown type %d", type);
+ }
+ }
+
+ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ replace_all(val, "\\", "\\\\");
+ replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+ }
+
  //
  // ggml helpers
  //
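The two helpers above are static to llama.cpp, so they are only callable from within that file. As a minimal sketch of how they combine, assuming a valid gguf_context named ctx_gguf (for example the one held by llama_model_loader); the dump_gguf_kv name is hypothetical and not part of this patch:

static void dump_gguf_kv(struct gguf_context * ctx_gguf) {
    const int n_kv = gguf_get_n_kv(ctx_gguf);
    for (int i = 0; i < n_kv; i++) {
        // key name and stringified value (array values are rendered as "[...]")
        const char *      key = gguf_get_key(ctx_gguf, i);
        const std::string val = gguf_kv_to_str(ctx_gguf, i);
        LLAMA_LOG_INFO("%s = %s\n", key, val.c_str());
    }
}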
@@ -1221,6 +1275,7 @@ struct llama_kv_cache {
  // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
  // computed before each graph build
  uint32_t n = 0;
@@ -1322,6 +1377,9 @@ struct llama_model {
 
  int n_gpu_layers;
 
+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
  // context
  struct ggml_context * ctx = NULL;
 
@@ -1442,6 +1500,7 @@ static bool llama_kv_cache_init(
 
  cache.head = 0;
  cache.size = n_ctx;
+ cache.used = 0;
 
  cache.cells.clear();
  cache.cells.resize(n_ctx);
@@ -1543,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
  }
  }
 
+ cache.used += n_tokens;
+
  return true;
  }
 
@@ -1563,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  cache.cells[i].seq_id.clear();
  }
  cache.head = 0;
+ cache.used = 0;
  }
 
  static void llama_kv_cache_seq_rm(
@@ -1585,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
  continue;
  }
  if (cache.cells[i].seq_id.empty()) {
+ // keep count of the number of used cells
+ if (cache.cells[i].pos >= 0) cache.used--;
+
  cache.cells[i].pos = -1;
  if (new_head == cache.size) new_head = i;
  }
@@ -1592,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
  }
 
  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_cp(
@@ -1618,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 
  for (uint32_t i = 0; i < cache.size; ++i) {
  if (!cache.cells[i].has_seq_id(seq_id)) {
+ if (cache.cells[i].pos >= 0) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1628,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
  }
 
  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_shift(
@@ -1649,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
  cache.cells[i].delta += delta;
 
  if (cache.cells[i].pos < 0) {
+ if (!cache.cells[i].seq_id.empty()) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1780,10 +1847,10 @@ struct llama_model_loader {
  case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  default:
- {
- LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
- ftype = LLAMA_FTYPE_ALL_F32;
- } break;
+ {
+ LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+ ftype = LLAMA_FTYPE_ALL_F32;
+ } break;
  }
 
  // this is a way to mark that we have "guessed" the file type
@@ -1797,10 +1864,21 @@ struct llama_model_loader {
  }
 
  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(ctx_gguf, i);
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ : gguf_type_name(type);
+
+ std::string value = gguf_kv_to_str(ctx_gguf, i);
+ const size_t MAX_VALUE_LEN = 40;
+ if (value.size() > MAX_VALUE_LEN) {
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+ }
+ replace_all(value, "\n", "\\n");
 
- LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
  }
 
  // print type counts
@@ -2095,6 +2173,17 @@ static void llm_load_hparams(
 
  auto & hparams = model.hparams;
 
+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+ enum gguf_type type = gguf_get_kv_type(ctx, i);
+ if (type == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ const char * name = gguf_get_key(ctx, i);
+ const std::string value = gguf_kv_to_str(ctx, i);
+ model.gguf_kv.emplace(name, value);
+ }
+
  // get general kv
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
@@ -4730,92 +4819,34 @@ struct llm_build_context {
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(tmpq, "tmpq", il);
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
 
- struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- cb(tmpk, "tmpk", il);
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
 
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);
 
- // RoPE the first n_rot of q/k, pass the other half, and concat.
- struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- 0
- ));
- cb(qrot, "qrot", il);
-
- struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head_kv,
- 0
- ));
- cb(krot, "krot", il);
-
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
- struct ggml_tensor * qpass = ggml_view_3d(
- ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- ggml_element_size(tmpq) * hparams.n_rot
- );
- cb(qpass, "qpass", il);
-
- struct ggml_tensor * kpass = ggml_view_3d(
- ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
- ggml_element_size(tmpk) * (n_embd_head),
- ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
- ggml_element_size(tmpk) * hparams.n_rot
- );
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * qrotated = ggml_rope_custom(
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(qrotated, "qrotated", il);
-
- struct ggml_tensor * krotated = ggml_rope_custom(
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
  );
- cb(krotated, "krotated", il);
-
- // ggml currently only supports concatenation on dim=2
- // so we need to permute qrot, qpass, concat, then permute back.
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
- cb(qrotated, "qrotated", il);
-
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
- cb(krotated, "krotated", il);
-
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
- cb(qpass, "qpass", il);
-
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
  cb(Qcur, "Qcur", il);
 
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
- cb(Q, "Q", il);
-
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
  cb(Kcur, "Kcur", il);
 
  llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
  cur = llm_build_kqv(ctx0, hparams, kv_self,
  model.layers[il].wo, NULL,
- Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
  cb(cur, "kqv_out", il);
  }
 
@@ -5443,6 +5474,12 @@ static int llama_decode_internal(
  batch.seq_id = seq_id_arr.data();
  }
 
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
+ kv_self.head = 0;
+ }
+
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
  return 1;
  }
@@ -5453,7 +5490,7 @@ static int llama_decode_internal(
  //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
  kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
 
- //printf("kv_self.n = %d\n", kv_self.n);
+ //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
  ggml_allocr_reset(lctx.alloc);
 
@@ -8666,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
  return model->hparams.rope_freq_scale_train;
  }
 
+ int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+ const auto & it = model->gguf_kv.find(key);
+ if (it == model->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
+ int llama_model_meta_count(const struct llama_model * model) {
+ return (int)model->gguf_kv.size();
+ }
+
+ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+ }
+
+ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  llama_model_arch_name(model->arch).c_str(),
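A minimal usage sketch for the metadata accessors added above (not part of the patch). It assumes model is a llama_model * already loaded by the caller; the 128-byte buffers and the print_model_metadata name are illustrative only:

#include "llama.h"
#include <cstdio>

static void print_model_metadata(const struct llama_model * model) {
    char key[128];
    char val[128];
    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        // both calls return -1 on failure and always null-terminate the buffer
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }
    // a single value can also be looked up by key name, e.g. "general.name"
    if (llama_model_meta_val_str(model, "general.name", val, sizeof(val)) >= 0) {
        printf("general.name = %s\n", val);
    }
}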
@@ -8724,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }
 
+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+ struct llama_kv_cache_view result = {
+ /*.n_cells = */ 0,
+ /*.n_max_seq = */ n_max_seq,
+ /*.token_count = */ 0,
+ /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
+ /*.max_contiguous = */ 0,
+ /*.max_contiguous_idx = */ -1,
+ /*.cells = */ nullptr,
+ /*.cells_sequences = */ nullptr,
+ };
+ return result;
+ }
+
+ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+ if (view->cells != nullptr) {
+ free(view->cells);
+ view->cells = nullptr;
+ }
+ if (view->cells_sequences != nullptr) {
+ free(view->cells_sequences);
+ view->cells_sequences = nullptr;
+ }
+ }
+
+ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+ if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+ view->n_cells = int32_t(ctx->kv_self.size);
+ void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+ view->cells = (struct llama_kv_cache_view_cell *)p;
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+ view->cells_sequences = (llama_seq_id *)p;
+ }
+
+ const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+ llama_kv_cache_view_cell * c_curr = view->cells;
+ llama_seq_id * cs_curr = view->cells_sequences;
+ int32_t used_cells = 0;
+ int32_t token_count = 0;
+ int32_t curr_contig_idx = -1;
+ uint32_t max_contig = 0;
+ int32_t max_contig_idx = -1;
+
+ for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+ const size_t curr_size = kv_cells[i].seq_id.size();
+ token_count += curr_size;
+ c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+ if (curr_size > 0) {
+ if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+ max_contig = i - curr_contig_idx;
+ max_contig_idx = curr_contig_idx;
+ }
+ curr_contig_idx = -1;
+ } else if (curr_contig_idx < 0) {
+ curr_contig_idx = i;
+ }
+
+ int seq_idx = 0;
+ for (const llama_seq_id it : kv_cells[i].seq_id) {
+ if (seq_idx >= view->n_max_seq) {
+ break;
+ }
+ cs_curr[seq_idx] = it;
+ seq_idx++;
+ }
+ if (seq_idx != 0) {
+ used_cells++;
+ }
+ for (; seq_idx < view->n_max_seq; seq_idx++) {
+ cs_curr[seq_idx] = -1;
+ }
+ }
+ if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+ max_contig_idx = curr_contig_idx;
+ max_contig = kv_cells.size() - curr_contig_idx;
+ }
+ view->max_contiguous = max_contig;
+ view->max_contiguous_idx = max_contig_idx;
+ view->token_count = token_count;
+ view->used_cells = used_cells;
+ if (uint32_t(used_cells) != ctx->kv_self.used) {
+ LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+ __func__, ctx->kv_self.used, used_cells);
+ }
+ }
+
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.head;
+ int result = 0;
+
+ for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+ result += ctx->kv_self.cells[i].seq_id.size();
+ }
+
+ return result;
+ }
+
+ int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+ return ctx->kv_self.used;
  }
 
  void llama_kv_cache_clear(struct llama_context * ctx) {
@@ -8895,10 +9070,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const size_t kv_buf_size = kv_self.buf.size;
  const uint32_t kv_head = kv_self.head;
  const uint32_t kv_size = kv_self.size;
+ const uint32_t kv_used = kv_self.used;
 
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
  data_ctx->write(&kv_head, sizeof(kv_head));
  data_ctx->write(&kv_size, sizeof(kv_size));
+ data_ctx->write(&kv_used, sizeof(kv_used));
 
  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
@@ -9021,10 +9198,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  size_t kv_buf_size;
  uint32_t kv_head;
  uint32_t kv_size;
+ uint32_t kv_used;
 
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
  if (kv_buf_size) {
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
@@ -9059,6 +9238,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
  ctx->kv_self.head = kv_head;
  ctx->kv_self.size = kv_size;
+ ctx->kv_self.used = kv_used;
 
  ctx->kv_self.cells.resize(kv_size);
 
@@ -301,6 +301,23 @@ extern "C" {
  // Get the model's RoPE frequency scaling factor
  LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
+ // Functions to access the model's GGUF metadata scalar values
+ // - The functions return the length of the string on success, or -1 on failure
+ // - The output string is always null-terminated and cleared on failure
+ // - GGUF array values are not supported by these functions
+
+ // Get metadata value as a string by key name
+ LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+ // Get the number of metadata key/value pairs
+ LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+ // Get metadata key name by index
+ LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+ // Get metadata value as a string by index
+ LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
  // Get a string describing the model type
  LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
@@ -344,9 +361,60 @@ extern "C" {
  // KV cache
  //
 
- // Returns the number of tokens in the KV cache
- LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
- "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+ // Information associated with an individual cell in the KV cache view.
+ struct llama_kv_cache_view_cell {
+ // The position for this cell. Takes KV cache shifts into account.
+ // May be negative if the cell is not populated.
+ llama_pos pos;
+ };
+
+ // An updateable view of the KV cache.
+ struct llama_kv_cache_view {
+ // Number of KV cache cells. This will be the same as the context size.
+ int32_t n_cells;
+
+ // Maximum number of sequences that can exist in a cell. It's not an error
+ // if there are more sequences in a cell than this value, however they will
+ // not be visible in the view cells_sequences.
+ int32_t n_max_seq;
+
+ // Number of tokens in the cache. For example, if there are two populated
+ // cells, the first with 1 sequence id in it and the second with 2 sequence
+ // ids then you'll have 3 tokens.
+ int32_t token_count;
+
+ // Number of populated cache cells.
+ int32_t used_cells;
+
+ // Maximum contiguous empty slots in the cache.
+ int32_t max_contiguous;
+
+ // Index to the start of the max_contiguous slot range. Can be negative
+ // when cache is full.
+ int32_t max_contiguous_idx;
+
+ // Information for an individual cell.
+ struct llama_kv_cache_view_cell * cells;
+
+ // The sequences for each cell. There will be n_max_seq items per cell.
+ llama_seq_id * cells_sequences;
+ };
+
+ // Create an empty KV cache view. (use only for debugging purposes)
+ LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+ // Free a KV cache view. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+ // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+ // Returns the number of tokens in the KV cache (slow, use only for debug)
+ // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+ // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+ LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
  // Clear the KV cache
  LLAMA_API void llama_kv_cache_clear(
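A minimal debugging sketch for the new KV cache view API (not part of the patch), assuming ctx is a valid llama_context * and that up to 4 sequence ids per cell are of interest; the dump_kv_cache_stats name is illustrative:

#include "llama.h"
#include <cstdio>

static void dump_kv_cache_stats(const struct llama_context * ctx) {
    // create the view once; it can be refreshed as often as needed
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4);
    llama_kv_cache_view_update(ctx, &view);

    printf("cells: %d, used: %d, tokens: %d, max contiguous free: %d (starting at %d)\n",
           view.n_cells, view.used_cells, view.token_count,
           view.max_contiguous, view.max_contiguous_idx);

    // per-cell data lives in view.cells[i].pos and
    // view.cells_sequences[i*view.n_max_seq + s]
    llama_kv_cache_view_free(&view);
}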
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.9.3'
+ VERSION = '0.9.4'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1523'
+ LLAMA_CPP_VERSION = 'b1555'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.9.3
+ version: 0.9.4
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-11-18 00:00:00.000000000 Z
+ date: 2023-11-25 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: