llama_cpp 0.9.3 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +68 -40
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +86 -8
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +264 -84
- data/ext/llama_cpp/src/llama.h +71 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,7 +91,7 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
-#define LLAMA_MAX_NODES
+#define LLAMA_MAX_NODES 8192
 
 //
 // logging
@@ -604,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
     return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }
 
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
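The new gguf_kv_to_str helper renders scalar and array metadata values as display strings, escaping backslashes and quotes inside string arrays. Below is a standalone sketch of that escaping logic; the replace_all helper here is an assumed stand-in for llama.cpp's internal one, not a copy of it:

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Stand-in for llama.cpp's internal replace_all helper (assumption:
// the real one behaves like this simple in-place substring replace).
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    // render a string array the way gguf_kv_to_str does: backslashes and
    // quotes escaped, items quoted and comma-separated inside brackets
    const std::vector<std::string> arr = {"hello", "say \"hi\""};
    std::stringstream ss;
    ss << "[";
    for (size_t j = 0; j < arr.size(); j++) {
        std::string val = arr[j];
        replace_all(val, "\\", "\\\\");
        replace_all(val, "\"", "\\\"");
        ss << '"' << val << '"';
        if (j + 1 < arr.size()) ss << ", ";
    }
    ss << "]";
    std::cout << ss.str() << std::endl; // ["hello", "say \"hi\""]
    return 0;
}
```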
@@ -1221,6 +1275,7 @@ struct llama_kv_cache {
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
     uint32_t n = 0;
@@ -1322,6 +1377,9 @@ struct llama_model {
 
     int n_gpu_layers;
 
+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     // context
     struct ggml_context * ctx = NULL;
 
@@ -1442,6 +1500,7 @@ static bool llama_kv_cache_init(
 
     cache.head = 0;
     cache.size = n_ctx;
+    cache.used = 0;
 
     cache.cells.clear();
     cache.cells.resize(n_ctx);
@@ -1543,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
         }
     }
 
+    cache.used += n_tokens;
+
     return true;
 }
 
@@ -1563,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
         cache.cells[i].seq_id.clear();
     }
     cache.head = 0;
+    cache.used = 0;
 }
 
 static void llama_kv_cache_seq_rm(
@@ -1585,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
                 continue;
             }
             if (cache.cells[i].seq_id.empty()) {
+                // keep count of the number of used cells
+                if (cache.cells[i].pos >= 0) cache.used--;
+
                 cache.cells[i].pos = -1;
                 if (new_head == cache.size) new_head = i;
             }
@@ -1592,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
     }
 
     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_cp(
@@ -1618,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
+            if (cache.cells[i].pos >= 0) cache.used--;
            cache.cells[i].pos = -1;
            cache.cells[i].seq_id.clear();
            if (new_head == cache.size) new_head = i;
@@ -1628,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     }
 
     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
 static void llama_kv_cache_seq_shift(
@@ -1649,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
             cache.cells[i].delta += delta;
 
             if (cache.cells[i].pos < 0) {
+                if (!cache.cells[i].seq_id.empty()) cache.used--;
                cache.cells[i].pos = -1;
                cache.cells[i].seq_id.clear();
                if (new_head == cache.size) new_head = i;
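Taken together, the `used` updates in the hunks above maintain one invariant: cache.used always equals the number of cells holding at least one sequence id (incremented in find_slot, decremented whenever a cell's last sequence goes away, reset in init/clear). A standalone sketch of that invariant as a recount, with a hypothetical `cell` type standing in for llama.cpp's internal llama_kv_cell:

```cpp
#include <cstdint>
#include <set>
#include <vector>

// Hypothetical stand-in for llama.cpp's internal llama_kv_cell, for
// illustration only: a cell is "used" when it holds at least one seq_id.
struct cell {
    int32_t pos = -1;
    std::set<int32_t> seq_id;
};

// The invariant the new counter maintains: cache.used always equals the
// number of cells with a non-empty seq_id set, without this O(n) recount.
static uint32_t recount_used(const std::vector<cell> & cells) {
    uint32_t used = 0;
    for (const auto & c : cells) {
        if (!c.seq_id.empty()) {
            used++;
        }
    }
    return used;
}
```

Keeping the counter incrementally up to date lets callers query cache occupancy in O(1); the new llama_kv_cache_view_update further below uses exactly such a recount as a consistency check.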
@@ -1780,10 +1847,10 @@ struct llama_model_loader {
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K;   break;
             default:
-
-
-
-
+                {
+                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                    ftype = LLAMA_FTYPE_ALL_F32;
+                } break;
         }
 
         // this is a way to mark that we have "guessed" the file type
@@ -1797,10 +1864,21 @@ struct llama_model_loader {
         }
 
         for (int i = 0; i < n_kv; i++) {
-            const char * name
-            const enum gguf_type type
+            const char * name           = gguf_get_key(ctx_gguf, i);
+            const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(ctx_gguf, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
 
-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
 
         // print type counts
@@ -2095,6 +2173,17 @@ static void llm_load_hparams(
 
     auto & hparams = model.hparams;
 
+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
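Note that array-typed KV pairs are skipped when populating model.gguf_kv, so the map only ever holds scalar and string metadata. This matches the comment in the llama.h hunk further below stating that GGUF array values are not supported by the new metadata accessors.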
@@ -4730,92 +4819,34 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor *
-                cb(
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor *
-                cb(
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
-
-
-
-
-                    ggml_element_size(tmpq) * n_embd_head * n_head,
-                    0
-                ));
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
-                    ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
-                    ggml_element_size(tmpk) * n_embd_head,
-                    ggml_element_size(tmpk) * n_embd_head * n_head_kv,
-                    0
-                ));
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                    ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
-                    ggml_element_size(tmpq) * n_embd_head,
-                    ggml_element_size(tmpq) * n_embd_head * n_head,
-                    ggml_element_size(tmpq) * hparams.n_rot
-                );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                    ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
-                    ggml_element_size(tmpk) * (n_embd_head),
-                    ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
-                    ggml_element_size(tmpk) * hparams.n_rot
-                );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                    ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                    ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
                 );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
                 cb(Qcur, "Qcur", il);
 
-
-
-
-
-
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Kcur, "Kcur", il);
 
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
                         model.layers[il].wo, NULL,
-
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
 
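This hunk replaces the hand-rolled partial-RoPE path (splitting Q/K into qrot/qpass views, roping only the first hparams.n_rot dimensions, then permuting and concatenating the halves back together) with single ggml_rope_custom calls over the reshaped Qcur/Kcur tensors. The simplification presumably relies on ggml's rope now rotating only the first n_dims (here hparams.n_rot) components of each head and passing the rest through; the ggml.c changes bundled in this release (see the +86 -8 entry in the file list) appear to be what makes that possible.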
@@ -5443,6 +5474,12 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }
 
+    // if we have enough unused cells before the current head ->
+    //   better to start searching from the beginning of the cache, hoping to fill it
+    if (kv_self.head > kv_self.used + 2*n_tokens) {
+        kv_self.head = 0;
+    }
+
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
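This heuristic keeps the cache packed: if the search head has drifted well past the number of cells actually in use, the next slot search restarts at cell 0 and can reclaim freed cells. With hypothetical numbers: head = 512, used = 64 and n_tokens = 128 gives 512 > 64 + 256, so the search restarts from the beginning.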
@@ -5453,7 +5490,7 @@ static int llama_decode_internal(
     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
 
-    //printf("kv_self.n = %
+    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_allocr_reset(lctx.alloc);
 
@@ -8666,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
 
+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),
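These four functions are the implementation of the new metadata API declared in the llama.h hunk below, backed by the model->gguf_kv map populated at load time. A minimal usage sketch; the dump_metadata helper and the 256-byte buffers are assumptions for illustration, not part of the diff:

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

// Hypothetical helper, not part of the diff: enumerate all scalar GGUF
// metadata of a loaded model via the new accessors. The 256-byte buffers
// are an arbitrary choice; the functions truncate like snprintf.
static void dump_metadata(const struct llama_model * model) {
    const int n = llama_model_meta_count(model);
    for (int i = 0; i < n; i++) {
        std::vector<char> key(256), val(256);
        // each call returns the string length on success, -1 on failure
        if (llama_model_meta_key_by_index(model, i, key.data(), key.size()) >= 0 &&
            llama_model_meta_val_str_by_index(model, i, val.data(), val.size()) >= 0) {
            printf("%s = %s\n", key.data(), val.data());
        }
    }
}
```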
@@ -8724,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }
 
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+    struct llama_kv_cache_view result = {
+        /*.n_cells            = */ 0,
+        /*.n_max_seq          = */ n_max_seq,
+        /*.token_count        = */ 0,
+        /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
+        /*.max_contiguous     = */ 0,
+        /*.max_contiguous_idx = */ -1,
+        /*.cells              = */ nullptr,
+        /*.cells_sequences    = */ nullptr,
+    };
+    return result;
+}
+
+void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+    if (view->cells != nullptr) {
+        free(view->cells);
+        view->cells = nullptr;
+    }
+    if (view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-
+    int result = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+        result += ctx->kv_self.cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+    return ctx->kv_self.used;
 }
 
 void llama_kv_cache_clear(struct llama_context * ctx) {
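A minimal usage sketch of the new debugging view; the dump_kv_view helper and the choice of 4 sequences per cell are assumptions for illustration, not part of the diff:

```cpp
#include <cstdio>
#include "llama.h"

// Hypothetical helper, not part of the diff: print a summary of the KV
// cache using the new debugging view. Assumes `ctx` is a valid context.
static void dump_kv_view(const struct llama_context * ctx) {
    // track at most 4 sequence ids per cell (an arbitrary choice here)
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4);
    llama_kv_cache_view_update(ctx, &view); // allocates and fills the cell arrays

    printf("cells: %d, tokens: %d, used: %d, max contiguous free: %d @ %d\n",
           (int) view.n_cells, (int) view.token_count, (int) view.used_cells,
           (int) view.max_contiguous, (int) view.max_contiguous_idx);

    for (int32_t i = 0; i < view.n_cells; i++) {
        // cells_sequences holds n_max_seq entries per cell; -1 marks an empty slot
        const llama_seq_id first = view.cells_sequences[i * view.n_max_seq];
        if (first >= 0) {
            printf("cell %4d: pos %4d, first seq %d\n",
                   (int) i, (int) view.cells[i].pos, (int) first);
        }
    }

    llama_kv_cache_view_free(&view);
}
```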
@@ -8895,10 +9070,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const size_t   kv_buf_size = kv_self.buf.size;
         const uint32_t kv_head     = kv_self.head;
         const uint32_t kv_size     = kv_self.size;
+        const uint32_t kv_used     = kv_self.used;
 
         data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
         data_ctx->write(&kv_head,     sizeof(kv_head));
         data_ctx->write(&kv_size,     sizeof(kv_size));
+        data_ctx->write(&kv_used,     sizeof(kv_used));
 
         if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
@@ -9021,10 +9198,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         size_t   kv_buf_size;
         uint32_t kv_head;
         uint32_t kv_size;
+        uint32_t kv_used;
 
         memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
         memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
         memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
+        memcpy(&kv_used,     inp, sizeof(kv_used));     inp += sizeof(kv_used);
 
         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);
@@ -9059,6 +9238,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     ctx->kv_self.head = kv_head;
     ctx->kv_self.size = kv_size;
+    ctx->kv_self.used = kv_used;
 
     ctx->kv_self.cells.resize(kv_size);
 
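Because llama_copy_state_data/llama_set_state_data now write and read one extra field, the serialized kv-cache header grows by 4 bytes, and state blobs produced by 0.9.3 are presumably not interchangeable with 0.9.4. A comment-level sketch of the field order taken from the diff above; the real code writes each field individually, so this struct is illustrative only, not a format spec:

```cpp
#include <cstddef>
#include <cstdint>

// Field order of the serialized kv-cache header, per the diff above.
// Illustrative only: the real format writes fields back-to-back with
// data_ctx->write()/memcpy(), so struct padding does not apply.
struct kv_state_header_sketch {
    size_t   kv_buf_size;
    uint32_t kv_head;
    uint32_t kv_size;
    uint32_t kv_used; // new in this release
};
```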
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -301,6 +301,23 @@ extern "C" {
    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
+    // Functions to access the model's GGUF metadata scalar values
+    //  - The functions return the length of the string on success, or -1 on failure
+    //  - The output string is always null-terminated and cleared on failure
+    //  - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
    // Get a string describing the model type
    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
@@ -344,9 +361,60 @@ extern "C" {
    // KV cache
    //
 
-    //
-
-
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.3'
+  VERSION = '0.9.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1555'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.3
+  version: 0.9.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-
+date: 2023-11-25 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: