llama_cpp 0.9.3 → 0.9.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +68 -40
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +86 -8
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +264 -84
- data/ext/llama_cpp/src/llama.h +71 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,7 +91,7 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif

-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES 8192

 //
 // logging
@@ -604,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
     return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }

+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
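For reference, the array branch of gguf_kv_to_str backslash-escapes quotes in each element and joins everything into a bracketed, comma-separated list. A minimal standalone sketch of that formatting, using a plain std::vector in place of a real gguf_context and a hypothetical replace_all_demo helper standing in for llama.cpp's replace_all:

// Illustration only: mirrors the quoting/escaping gguf_kv_to_str applies to GGUF string arrays.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// simple find/replace loop, analogous in effect to llama.cpp's replace_all helper (assumption)
static void replace_all_demo(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::vector<std::string> arr = {"foo", "ba\"r"};   // hypothetical metadata array
    std::stringstream ss;
    ss << "[";
    for (size_t j = 0; j < arr.size(); j++) {
        std::string val = arr[j];
        replace_all_demo(val, "\\", "\\\\");           // escape backslashes first
        replace_all_demo(val, "\"", "\\\"");           // then escape quotes
        ss << '"' << val << '"';
        if (j + 1 < arr.size()) ss << ", ";
    }
    ss << "]";
    std::cout << ss.str() << std::endl;                // prints: ["foo", "ba\"r"]
    return 0;
}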
@@ -1221,6 +1275,7 @@ struct llama_kv_cache {
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)

     // computed before each graph build
     uint32_t n = 0;
@@ -1322,6 +1377,9 @@ struct llama_model {

     int n_gpu_layers;

+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     // context
     struct ggml_context * ctx = NULL;

@@ -1442,6 +1500,7 @@ static bool llama_kv_cache_init(

     cache.head = 0;
     cache.size = n_ctx;
+    cache.used = 0;

     cache.cells.clear();
     cache.cells.resize(n_ctx);
@@ -1543,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
         }
     }

+    cache.used += n_tokens;
+
     return true;
 }

@@ -1563,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
         cache.cells[i].seq_id.clear();
     }
     cache.head = 0;
+    cache.used = 0;
 }

 static void llama_kv_cache_seq_rm(
@@ -1585,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
             continue;
         }
         if (cache.cells[i].seq_id.empty()) {
+            // keep count of the number of used cells
+            if (cache.cells[i].pos >= 0) cache.used--;
+
             cache.cells[i].pos = -1;
             if (new_head == cache.size) new_head = i;
         }
@@ -1592,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
     }

     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
@@ -1618,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id

     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
+            if (cache.cells[i].pos >= 0) cache.used--;
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
             if (new_head == cache.size) new_head = i;
@@ -1628,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     }

     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }

 static void llama_kv_cache_seq_shift(
@@ -1649,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
             cache.cells[i].delta += delta;

             if (cache.cells[i].pos < 0) {
+                if (!cache.cells[i].seq_id.empty()) cache.used--;
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
                 if (new_head == cache.size) new_head = i;
@@ -1780,10 +1847,10 @@ struct llama_model_loader {
            case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
            case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K;   break;
            default:
-
-
-
-
+                {
+                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                    ftype = LLAMA_FTYPE_ALL_F32;
+                } break;
        }

        // this is a way to mark that we have "guessed" the file type
@@ -1797,10 +1864,21 @@ struct llama_model_loader {
        }

        for (int i = 0; i < n_kv; i++) {
-            const char * name
-            const enum gguf_type type
+            const char * name           = gguf_get_key(ctx_gguf, i);
+            const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(ctx_gguf, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");

-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
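The loader now caps logged metadata values at 40 characters: anything longer is cut to the first 37 characters plus "...". A small self-contained illustration of that cap (not llama.cpp code, just the same string arithmetic):

// Illustration of the 40-character cap applied before logging a metadata value.
#include <iostream>
#include <string>

int main() {
    const size_t MAX_VALUE_LEN = 40;
    std::string value(60, 'x');                               // hypothetical long metadata value
    if (value.size() > MAX_VALUE_LEN) {
        value = value.substr(0, MAX_VALUE_LEN - 3) + "...";   // same effect as format("%s...", ...)
    }
    std::cout << value << " (" << value.size() << " chars)" << std::endl; // 40 chars total
    return 0;
}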
@@ -2095,6 +2173,17 @@ static void llm_load_hparams(

     auto & hparams = model.hparams;

+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

@@ -4730,92 +4819,34 @@ struct llm_build_context {
        // self-attention
        {
            // compute Q and K and RoPE them
-            struct ggml_tensor *
-            cb(
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);

-            struct ggml_tensor *
-            cb(
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);

            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

-
-
-
-
-                ggml_element_size(tmpq) * n_embd_head * n_head,
-                0
-            ));
-            cb(qrot, "qrot", il);
-
-            struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
-                ggml_element_size(tmpk) * n_embd_head,
-                ggml_element_size(tmpk) * n_embd_head * n_head_kv,
-                0
-            ));
-            cb(krot, "krot", il);
-
-            // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-            struct ggml_tensor * qpass = ggml_view_3d(
-                ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
-                ggml_element_size(tmpq) * n_embd_head,
-                ggml_element_size(tmpq) * n_embd_head * n_head,
-                ggml_element_size(tmpq) * hparams.n_rot
-            );
-            cb(qpass, "qpass", il);
-
-            struct ggml_tensor * kpass = ggml_view_3d(
-                ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
-                ggml_element_size(tmpk) * (n_embd_head),
-                ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
-                ggml_element_size(tmpk) * hparams.n_rot
-            );
-            cb(kpass, "kpass", il);
-
-            struct ggml_tensor * qrotated = ggml_rope_custom(
-                ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-            );
-            cb(qrotated, "qrotated", il);
-
-            struct ggml_tensor * krotated = ggml_rope_custom(
-                ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+            Qcur = ggml_rope_custom(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
            );
-            cb(krotated, "krotated", il);
-
-            // ggml currently only supports concatenation on dim=2
-            // so we need to permute qrot, qpass, concat, then permute back.
-            qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-            cb(qrotated, "qrotated", il);
-
-            krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-            cb(krotated, "krotated", il);
-
-            qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-            cb(qpass, "qpass", il);
-
-            kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-            cb(kpass, "kpass", il);
-
-            struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
            cb(Qcur, "Qcur", il);

-
-
-
-
-
-
-            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+            Kcur = ggml_rope_custom(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
            cb(Kcur, "Kcur", il);

            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

            cur = llm_build_kqv(ctx0, hparams, kv_self,
                    model.layers[il].wo, NULL,
-
+                    Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
            cb(cur, "kqv_out", il);
        }

@@ -5443,6 +5474,12 @@ static int llama_decode_internal(
        batch.seq_id = seq_id_arr.data();
    }

+    // if we have enough unused cells before the current head ->
+    //   better to start searching from the beginning of the cache, hoping to fill it
+    if (kv_self.head > kv_self.used + 2*n_tokens) {
+        kv_self.head = 0;
+    }
+
    if (!llama_kv_cache_find_slot(kv_self, batch)) {
        return 1;
    }
@@ -5453,7 +5490,7 @@ static int llama_decode_internal(
    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

-    //printf("kv_self.n = %
+    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

    ggml_allocr_reset(lctx.alloc);

@@ -8666,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
 }

+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),
@@ -8724,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
    }
 }

+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+    struct llama_kv_cache_view result = {
+        /*.n_cells            = */ 0,
+        /*.n_max_seq          = */ n_max_seq,
+        /*.token_count        = */ 0,
+        /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
+        /*.max_contiguous     = */ 0,
+        /*.max_contiguous_idx = */ -1,
+        /*.cells              = */ nullptr,
+        /*.cells_sequences    = */ nullptr,
+    };
+    return result;
+}
+
+void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+    if (view->cells != nullptr) {
+        free(view->cells);
+        view->cells = nullptr;
+    }
+    if (view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-
+    int result = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+        result += ctx->kv_self.cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+    return ctx->kv_self.used;
 }

 void llama_kv_cache_clear(struct llama_context * ctx) {
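With this change, llama_get_kv_cache_token_count counts one entry per (cell, sequence id) pair, while the new llama_get_kv_cache_used_cells counts each populated cell once; a cell shared by two sequences therefore adds 2 to the former but only 1 to the latter. A hedged fragment showing the two side by side, assuming `ctx` is an initialized llama_context (model and context setup omitted):

// Sketch only: `ctx` is assumed to be a live llama_context created elsewhere.
#include <cstdio>
#include "llama.h"

void log_kv_cache_usage(const struct llama_context * ctx) {
    const int n_kv_tokens = llama_get_kv_cache_token_count(ctx); // per (cell, seq_id) pair
    const int n_kv_used   = llama_get_kv_cache_used_cells(ctx);  // per populated cell
    std::printf("kv cache: %d tokens across %d used cells\n", n_kv_tokens, n_kv_used);
}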
@@ -8895,10 +9070,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
    const size_t   kv_buf_size = kv_self.buf.size;
    const uint32_t kv_head     = kv_self.head;
    const uint32_t kv_size     = kv_self.size;
+    const uint32_t kv_used     = kv_self.used;

    data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
    data_ctx->write(&kv_head,     sizeof(kv_head));
    data_ctx->write(&kv_size,     sizeof(kv_size));
+    data_ctx->write(&kv_used,     sizeof(kv_used));

    if (kv_buf_size) {
        const size_t elt_size = ggml_element_size(kv_self.k);
@@ -9021,10 +9198,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
    size_t   kv_buf_size;
    uint32_t kv_head;
    uint32_t kv_size;
+    uint32_t kv_used;

    memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
    memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
    memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
+    memcpy(&kv_used,     inp, sizeof(kv_used));     inp += sizeof(kv_used);

    if (kv_buf_size) {
        GGML_ASSERT(kv_self.buf.size == kv_buf_size);
@@ -9059,6 +9238,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

    ctx->kv_self.head = kv_head;
    ctx->kv_self.size = kv_size;
+    ctx->kv_self.used = kv_used;

    ctx->kv_self.cells.resize(kv_size);

data/ext/llama_cpp/src/llama.h
CHANGED
@@ -301,6 +301,23 @@ extern "C" {
    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Functions to access the model's GGUF metadata scalar values
+    //  - The functions return the length of the string on success, or -1 on failure
+    //  - The output string is always null-terminated and cleared on failure
+    //  - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
    // Get a string describing the model type
    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

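A hedged sketch of how client code might use these accessors to enumerate a loaded model's metadata; it assumes `model` was obtained elsewhere (for example via llama_load_model_from_file, not shown) and that the fixed-size buffers are large enough, since the values are truncated snprintf-style otherwise:

// Sketch only: `model` is assumed to be a llama_model * loaded elsewhere.
#include <cstdio>
#include "llama.h"

void dump_model_metadata(const struct llama_model * model) {
    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        char key[256];
        char val[512];
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0 ||
            llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) {
            continue; // -1 means the index was invalid; the output buffers are cleared
        }
        std::printf("%-40s = %s\n", key, val);
    }
}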
@@ -344,9 +361,60 @@ extern "C" {
    // KV cache
    //

-    //
-
-
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);

    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
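A hedged sketch of the intended debugging workflow for the view API: init once, update whenever a fresh snapshot is needed, free at the end. It assumes `ctx` is an initialized llama_context (creation omitted) and caps the per-cell sequence list at 4:

// Sketch only: `ctx` is assumed to be a live llama_context created elsewhere.
#include <cstdio>
#include "llama.h"

void print_kv_view(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq=*/4);
    llama_kv_cache_view_update(ctx, &view);

    std::printf("cells=%d used=%d tokens=%d max_contig=%d (at %d)\n",
            view.n_cells, view.used_cells, view.token_count,
            view.max_contiguous, view.max_contiguous_idx);

    for (int i = 0; i < view.n_cells; i++) {
        const llama_seq_id * seqs = view.cells_sequences + (size_t)i * view.n_max_seq;
        if (seqs[0] < 0) {
            continue; // empty cell: unused sequence slots are filled with -1
        }
        std::printf("cell %d: pos=%d first_seq=%d\n", i, view.cells[i].pos, seqs[0]);
    }

    llama_kv_cache_view_free(&view);
}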
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.3'
+  VERSION = '0.9.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1555'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.3
+  version: 0.9.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-
+date: 2023-11-25 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: