llama_cpp 0.9.3 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llama.cpp CHANGED
@@ -91,7 +91,7 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif
 
- #define LLAMA_MAX_NODES 4096
+ #define LLAMA_MAX_NODES 8192
 
  //
  // logging
@@ -604,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
      return LLAMA_ROPE_SCALING_UNSPECIFIED;
  }
 
+ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+     switch (type) {
+         case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+         case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+         case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+         case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+         case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+         case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+         case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+         case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+         case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+         case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+         case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+         default:                return format("unknown type %d", type);
+     }
+ }
+
+ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+     switch (type) {
+         case GGUF_TYPE_STRING:
+             return gguf_get_val_str(ctx_gguf, i);
+         case GGUF_TYPE_ARRAY:
+             {
+                 const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                 int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                 const void * data = gguf_get_arr_data(ctx_gguf, i);
+                 std::stringstream ss;
+                 ss << "[";
+                 for (int j = 0; j < arr_n; j++) {
+                     if (arr_type == GGUF_TYPE_STRING) {
+                         std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                         // escape quotes
+                         replace_all(val, "\\", "\\\\");
+                         replace_all(val, "\"", "\\\"");
+                         ss << '"' << val << '"';
+                     } else if (arr_type == GGUF_TYPE_ARRAY) {
+                         ss << "???";
+                     } else {
+                         ss << gguf_data_to_str(arr_type, data, j);
+                     }
+                     if (j < arr_n - 1) {
+                         ss << ", ";
+                     }
+                 }
+                 ss << "]";
+                 return ss.str();
+             }
+         default:
+             return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+     }
+ }
+
  //
  // ggml helpers
  //
@@ -1221,6 +1275,7 @@ struct llama_kv_cache {
      // cannot be freely changed after a slot has been allocated.
      uint32_t head = 0;
      uint32_t size = 0;
+     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
      // computed before each graph build
      uint32_t n = 0;
@@ -1322,6 +1377,9 @@ struct llama_model {
 
      int n_gpu_layers;
 
+     // gguf metadata
+     std::unordered_map<std::string, std::string> gguf_kv;
+
      // context
      struct ggml_context * ctx = NULL;
 
@@ -1442,6 +1500,7 @@ static bool llama_kv_cache_init(
 
      cache.head = 0;
      cache.size = n_ctx;
+     cache.used = 0;
 
      cache.cells.clear();
      cache.cells.resize(n_ctx);
@@ -1543,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
          }
      }
 
+     cache.used += n_tokens;
+
      return true;
  }
 
@@ -1563,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
          cache.cells[i].seq_id.clear();
      }
      cache.head = 0;
+     cache.used = 0;
  }
 
  static void llama_kv_cache_seq_rm(
@@ -1585,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
                  continue;
              }
              if (cache.cells[i].seq_id.empty()) {
+                 // keep count of the number of used cells
+                 if (cache.cells[i].pos >= 0) cache.used--;
+
                  cache.cells[i].pos = -1;
                  if (new_head == cache.size) new_head = i;
              }
@@ -1592,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
      }
 
      // If we freed up a slot, set head to it so searching can start there.
-     if (new_head != cache.size) cache.head = new_head;
+     if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_cp(
@@ -1618,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 
      for (uint32_t i = 0; i < cache.size; ++i) {
          if (!cache.cells[i].has_seq_id(seq_id)) {
+             if (cache.cells[i].pos >= 0) cache.used--;
              cache.cells[i].pos = -1;
              cache.cells[i].seq_id.clear();
              if (new_head == cache.size) new_head = i;
@@ -1628,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
      }
 
      // If we freed up a slot, set head to it so searching can start there.
-     if (new_head != cache.size) cache.head = new_head;
+     if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_shift(
@@ -1649,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
              cache.cells[i].delta += delta;
 
              if (cache.cells[i].pos < 0) {
+                 if (!cache.cells[i].seq_id.empty()) cache.used--;
                  cache.cells[i].pos = -1;
                  cache.cells[i].seq_id.clear();
                  if (new_head == cache.size) new_head = i;
@@ -1780,10 +1847,10 @@ struct llama_model_loader {
              case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
              case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K;   break;
              default:
-             {
-                 LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
-                 ftype = LLAMA_FTYPE_ALL_F32;
-             } break;
+                 {
+                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                     ftype = LLAMA_FTYPE_ALL_F32;
+                 } break;
          }
 
          // this is a way to mark that we have "guessed" the file type
@@ -1797,10 +1864,21 @@ struct llama_model_loader {
          }
 
          for (int i = 0; i < n_kv; i++) {
-             const char * name         = gguf_get_key(ctx_gguf, i);
-             const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+             const char * name           = gguf_get_key(ctx_gguf, i);
+             const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+             const std::string type_name =
+                 type == GGUF_TYPE_ARRAY
+                 ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                 : gguf_type_name(type);
+
+             std::string value = gguf_kv_to_str(ctx_gguf, i);
+             const size_t MAX_VALUE_LEN = 40;
+             if (value.size() > MAX_VALUE_LEN) {
+                 value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+             }
+             replace_all(value, "\n", "\\n");
 
-             LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+             LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
          }
 
          // print type counts
@@ -2095,6 +2173,17 @@ static void llm_load_hparams(
 
      auto & hparams = model.hparams;
 
+     // get metadata as string
+     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+         enum gguf_type type = gguf_get_kv_type(ctx, i);
+         if (type == GGUF_TYPE_ARRAY) {
+             continue;
+         }
+         const char * name = gguf_get_key(ctx, i);
+         const std::string value = gguf_kv_to_str(ctx, i);
+         model.gguf_kv.emplace(name, value);
+     }
+
      // get general kv
      GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
@@ -4730,92 +4819,34 @@ struct llm_build_context {
          // self-attention
          {
              // compute Q and K and RoPE them
-             struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-             cb(tmpq, "tmpq", il);
+             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
 
-             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-             cb(tmpk, "tmpk", il);
+             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
 
              struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
              cb(Vcur, "Vcur", il);
 
-             // RoPE the first n_rot of q/k, pass the other half, and concat.
-             struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
-                 ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
-                 ggml_element_size(tmpq) * n_embd_head,
-                 ggml_element_size(tmpq) * n_embd_head * n_head,
-                 0
-             ));
-             cb(qrot, "qrot", il);
-
-             struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
-                 ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
-                 ggml_element_size(tmpk) * n_embd_head,
-                 ggml_element_size(tmpk) * n_embd_head * n_head_kv,
-                 0
-             ));
-             cb(krot, "krot", il);
-
-             // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-             struct ggml_tensor * qpass = ggml_view_3d(
-                 ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
-                 ggml_element_size(tmpq) * n_embd_head,
-                 ggml_element_size(tmpq) * n_embd_head * n_head,
-                 ggml_element_size(tmpq) * hparams.n_rot
-             );
-             cb(qpass, "qpass", il);
-
-             struct ggml_tensor * kpass = ggml_view_3d(
-                 ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
-                 ggml_element_size(tmpk) * (n_embd_head),
-                 ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
-                 ggml_element_size(tmpk) * hparams.n_rot
-             );
-             cb(kpass, "kpass", il);
-
-             struct ggml_tensor * qrotated = ggml_rope_custom(
-                 ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-             );
-             cb(qrotated, "qrotated", il);
-
-             struct ggml_tensor * krotated = ggml_rope_custom(
-                 ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+             Qcur = ggml_rope_custom(
+                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                 hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                 ext_factor, attn_factor, beta_fast, beta_slow
              );
-             cb(krotated, "krotated", il);
-
-             // ggml currently only supports concatenation on dim=2
-             // so we need to permute qrot, qpass, concat, then permute back.
-             qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-             cb(qrotated, "qrotated", il);
-
-             krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-             cb(krotated, "krotated", il);
-
-             qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-             cb(qpass, "qpass", il);
-
-             kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-             cb(kpass, "kpass", il);
-
-             struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
              cb(Qcur, "Qcur", il);
 
-             struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-             cb(Kcur, "Kcur", il);
-
-             struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-             cb(Q, "Q", il);
-
-             Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+             Kcur = ggml_rope_custom(
+                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                 hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                 ext_factor, attn_factor, beta_fast, beta_slow
+             );
              cb(Kcur, "Kcur", il);
 
              llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
              cur = llm_build_kqv(ctx0, hparams, kv_self,
                  model.layers[il].wo, NULL,
-                 Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                 Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
              cb(cur, "kqv_out", il);
          }
 
@@ -5443,6 +5474,12 @@ static int llama_decode_internal(
          batch.seq_id = seq_id_arr.data();
      }
 
+     // if we have enough unused cells before the current head ->
+     //   better to start searching from the beginning of the cache, hoping to fill it
+     if (kv_self.head > kv_self.used + 2*n_tokens) {
+         kv_self.head = 0;
+     }
+
      if (!llama_kv_cache_find_slot(kv_self, batch)) {
          return 1;
      }
@@ -5453,7 +5490,7 @@ static int llama_decode_internal(
      //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
      kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
 
-     //printf("kv_self.n = %d\n", kv_self.n);
+     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
      ggml_allocr_reset(lctx.alloc);
 
@@ -8666,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
      return model->hparams.rope_freq_scale_train;
  }
 
+ int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+     const auto & it = model->gguf_kv.find(key);
+     if (it == model->gguf_kv.end()) {
+         if (buf_size > 0) {
+             buf[0] = '\0';
+         }
+         return -1;
+     }
+     return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
+ int llama_model_meta_count(const struct llama_model * model) {
+     return (int)model->gguf_kv.size();
+ }
+
+ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+     if (i < 0 || i >= (int)model->gguf_kv.size()) {
+         if (buf_size > 0) {
+             buf[0] = '\0';
+         }
+         return -1;
+     }
+     auto it = model->gguf_kv.begin();
+     std::advance(it, i);
+     return snprintf(buf, buf_size, "%s", it->first.c_str());
+ }
+
+ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+     if (i < 0 || i >= (int)model->gguf_kv.size()) {
+         if (buf_size > 0) {
+             buf[0] = '\0';
+         }
+         return -1;
+     }
+     auto it = model->gguf_kv.begin();
+     std::advance(it, i);
+     return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
      return snprintf(buf, buf_size, "%s %s %s",
              llama_model_arch_name(model->arch).c_str(),
@@ -8724,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
      }
  }
 
+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+     struct llama_kv_cache_view result = {
+         /*.n_cells            = */ 0,
+         /*.n_max_seq          = */ n_max_seq,
+         /*.token_count        = */ 0,
+         /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
+         /*.max_contiguous     = */ 0,
+         /*.max_contiguous_idx = */ -1,
+         /*.cells              = */ nullptr,
+         /*.cells_sequences    = */ nullptr,
+     };
+     return result;
+ }
+
+ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+     if (view->cells != nullptr) {
+         free(view->cells);
+         view->cells = nullptr;
+     }
+     if (view->cells_sequences != nullptr) {
+         free(view->cells_sequences);
+         view->cells_sequences = nullptr;
+     }
+ }
+
+ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+     if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+         view->n_cells = int32_t(ctx->kv_self.size);
+         void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+         view->cells = (struct llama_kv_cache_view_cell *)p;
+         p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+         view->cells_sequences = (llama_seq_id *)p;
+     }
+
+     const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+     llama_kv_cache_view_cell * c_curr = view->cells;
+     llama_seq_id * cs_curr = view->cells_sequences;
+     int32_t used_cells = 0;
+     int32_t token_count = 0;
+     int32_t curr_contig_idx = -1;
+     uint32_t max_contig = 0;
+     int32_t max_contig_idx = -1;
+
+     for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+         const size_t curr_size = kv_cells[i].seq_id.size();
+         token_count += curr_size;
+         c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+         if (curr_size > 0) {
+             if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                 max_contig = i - curr_contig_idx;
+                 max_contig_idx = curr_contig_idx;
+             }
+             curr_contig_idx = -1;
+         } else if (curr_contig_idx < 0) {
+             curr_contig_idx = i;
+         }
+
+         int seq_idx = 0;
+         for (const llama_seq_id it : kv_cells[i].seq_id) {
+             if (seq_idx >= view->n_max_seq) {
+                 break;
+             }
+             cs_curr[seq_idx] = it;
+             seq_idx++;
+         }
+         if (seq_idx != 0) {
+             used_cells++;
+         }
+         for (; seq_idx < view->n_max_seq; seq_idx++) {
+             cs_curr[seq_idx] = -1;
+         }
+     }
+     if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+         max_contig_idx = curr_contig_idx;
+         max_contig = kv_cells.size() - curr_contig_idx;
+     }
+     view->max_contiguous = max_contig;
+     view->max_contiguous_idx = max_contig_idx;
+     view->token_count = token_count;
+     view->used_cells = used_cells;
+     if (uint32_t(used_cells) != ctx->kv_self.used) {
+         LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+             __func__, ctx->kv_self.used, used_cells);
+     }
+ }
+
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-     return ctx->kv_self.head;
+     int result = 0;
+
+     for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+         result += ctx->kv_self.cells[i].seq_id.size();
+     }
+
+     return result;
+ }
+
+ int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+     return ctx->kv_self.used;
  }
 
  void llama_kv_cache_clear(struct llama_context * ctx) {
@@ -8895,10 +9070,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
          const size_t kv_buf_size = kv_self.buf.size;
          const uint32_t kv_head = kv_self.head;
          const uint32_t kv_size = kv_self.size;
+         const uint32_t kv_used = kv_self.used;
 
          data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
          data_ctx->write(&kv_head, sizeof(kv_head));
         data_ctx->write(&kv_size, sizeof(kv_size));
+         data_ctx->write(&kv_used, sizeof(kv_used));
 
          if (kv_buf_size) {
              const size_t elt_size = ggml_element_size(kv_self.k);
@@ -9021,10 +9198,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
          size_t kv_buf_size;
          uint32_t kv_head;
          uint32_t kv_size;
+         uint32_t kv_used;
 
          memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
          memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
          memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+         memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
          if (kv_buf_size) {
              GGML_ASSERT(kv_self.buf.size == kv_buf_size);
@@ -9059,6 +9238,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
          ctx->kv_self.head = kv_head;
          ctx->kv_self.size = kv_size;
+         ctx->kv_self.used = kv_used;
 
          ctx->kv_self.cells.resize(kv_size);
 
llama.h CHANGED
@@ -301,6 +301,23 @@ extern "C" {
      // Get the model's RoPE frequency scaling factor
      LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
+     // Functions to access the model's GGUF metadata scalar values
+     //  - The functions return the length of the string on success, or -1 on failure
+     //  - The output string is always null-terminated and cleared on failure
+     //  - GGUF array values are not supported by these functions
+
+     // Get metadata value as a string by key name
+     LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+     // Get the number of metadata key/value pairs
+     LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+     // Get metadata key name by index
+     LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+     // Get metadata value as a string by index
+     LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
      // Get a string describing the model type
      LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
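An aside, not part of the diff: a minimal sketch of how the metadata accessors declared above could be called from C. Only the llama_model_meta_* declarations come from this hunk; the loading and teardown calls (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_free_model, llama_backend_free) are assumed to match the llama.h shipped with this bundled revision.

```c
#include <stdio.h>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init(false /* numa */);

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        return 1;
    }

    char buf[512];

    // single key lookup; returns -1 and clears buf when the key is missing
    if (llama_model_meta_val_str(model, "general.architecture", buf, sizeof(buf)) >= 0) {
        printf("architecture: %s\n", buf);
    }

    // enumerate all scalar key/value pairs
    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        char key[256];
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) >= 0 &&
            llama_model_meta_val_str_by_index(model, i, buf, sizeof(buf)) >= 0) {
            printf("%s = %s\n", key, buf);
        }
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```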
@@ -344,9 +361,60 @@ extern "C" {
      // KV cache
      //
 
-     // Returns the number of tokens in the KV cache
-     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+     // Information associated with an individual cell in the KV cache view.
+     struct llama_kv_cache_view_cell {
+         // The position for this cell. Takes KV cache shifts into account.
+         // May be negative if the cell is not populated.
+         llama_pos pos;
+     };
+
+     // An updateable view of the KV cache.
+     struct llama_kv_cache_view {
+         // Number of KV cache cells. This will be the same as the context size.
+         int32_t n_cells;
+
+         // Maximum number of sequences that can exist in a cell. It's not an error
+         // if there are more sequences in a cell than this value, however they will
+         // not be visible in the view cells_sequences.
+         int32_t n_max_seq;
+
+         // Number of tokens in the cache. For example, if there are two populated
+         // cells, the first with 1 sequence id in it and the second with 2 sequence
+         // ids then you'll have 3 tokens.
+         int32_t token_count;
+
+         // Number of populated cache cells.
+         int32_t used_cells;
+
+         // Maximum contiguous empty slots in the cache.
+         int32_t max_contiguous;
+
+         // Index to the start of the max_contiguous slot range. Can be negative
+         // when cache is full.
+         int32_t max_contiguous_idx;
+
+         // Information for an individual cell.
+         struct llama_kv_cache_view_cell * cells;
+
+         // The sequences for each cell. There will be n_max_seq items per cell.
+         llama_seq_id * cells_sequences;
+     };
+
+     // Create an empty KV cache view. (use only for debugging purposes)
+     LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+     // Free a KV cache view. (use only for debugging purposes)
+     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+     // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+     // Returns the number of tokens in the KV cache (slow, use only for debug)
+     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+     LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
      // Clear the KV cache
      LLAMA_API void llama_kv_cache_clear(
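Also an aside, not part of the diff: a sketch of how the debugging view above might be driven, assuming ctx points at a llama_context that has already decoded some tokens. Only functions and fields declared in this hunk (plus stdio) are used.

```c
#include <stdio.h>
#include "llama.h"

// Dump KV cache occupancy for an existing context (debugging aid).
static void dump_kv_cache(const struct llama_context * ctx) {
    // cheap counter maintained by the cache itself
    printf("used cells : %d\n", llama_get_kv_cache_used_cells(ctx));
    // walks every cell, so debug-only
    printf("token count: %d\n", llama_get_kv_cache_token_count(ctx));

    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4 /* n_max_seq */);
    llama_kv_cache_view_update(ctx, &view);

    printf("cells: %d, largest free run: %d starting at %d\n",
           view.n_cells, view.max_contiguous, view.max_contiguous_idx);

    for (int32_t i = 0; i < view.n_cells; i++) {
        if (view.cells[i].pos < 0) {
            continue; // empty cell
        }
        // first of the (up to n_max_seq) sequence ids stored for this cell
        printf("cell %4d: pos %5d, seq_id %d\n",
               i, view.cells[i].pos, view.cells_sequences[i * view.n_max_seq]);
    }

    llama_kv_cache_view_free(&view);
}
```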
version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.9.3'
+   VERSION = '0.9.4'
 
    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b1523'
+   LLAMA_CPP_VERSION = 'b1555'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
-   version: 0.9.3
+   version: 0.9.4
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-11-18 00:00:00.000000000 Z
+ date: 2023-11-25 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: