llama_cpp 0.0.7 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +736 -36
- data/ext/llama_cpp/src/ggml-cuda.h +8 -33
- data/ext/llama_cpp/src/ggml-opencl.c +202 -20
- data/ext/llama_cpp/src/ggml.c +732 -496
- data/ext/llama_cpp/src/ggml.h +47 -5
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +76 -10
- data/ext/llama_cpp/src/llama.cpp +560 -147
- data/ext/llama_cpp/src/llama.h +71 -24
- data/lib/llama_cpp/client.rb +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +27 -3
- data/sig/llama_cpp.rbs +38 -3
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -5,7 +5,7 @@
 #include <cstdio>
 #endif
 
-#include "llama_util.h"
+#include "llama-util.h"
 #include "llama.h"
 
 #include "ggml.h"
@@ -28,11 +28,11 @@
 #include <atomic>
 #include <mutex>
 #include <sstream>
+#include <numeric>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
-
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
@@ -136,7 +136,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-
-
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -483,7 +483,6 @@ struct llama_file_loader {
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -560,7 +559,6 @@ struct llama_file_saver {
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -661,6 +659,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
+        ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -729,8 +728,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -782,7 +780,7 @@ static bool kv_cache_init(
     const int n_embd = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int64_t n_mem =
+    const int64_t n_mem = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -801,6 +799,8 @@ static bool kv_cache_init(
 
     cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");
 
     return true;
 }
@@ -809,7 +809,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
         /*.n_parts =*/ -1,
-        /*.seed =*/
+        /*.seed =*/ -1,
         /*.f16_kv =*/ false,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
@@ -853,7 +853,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
-        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -1088,6 +1087,7 @@ static bool llama_eval_internal(
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1114,6 +1114,8 @@ static bool llama_eval_internal(
         // compute Q and K and RoPE them
         struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
         struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        ggml_set_name(Qcur, "Qcur");
+        ggml_set_name(Kcur, "Kcur");
 
         // store key and value to memory
         {
@@ -1134,6 +1136,7 @@ static bool llama_eval_internal(
             ggml_permute(ctx0,
                     Qcur,
                     0, 2, 1, 3);
+        ggml_set_name(Q, "Q");
 
         struct ggml_tensor * K =
             ggml_permute(ctx0,
@@ -1141,21 +1144,26 @@ static bool llama_eval_internal(
                         ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                         n_embd/n_head, n_head, n_past + N),
                     0, 2, 1, 3);
+        ggml_set_name(K, "K");
 
         // K * Q
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+        ggml_set_name(KQ, "KQ");
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
-        struct ggml_tensor *
-
-
-
+        struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+        ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+        ggml_set_name(KQ_scaled, "KQ_scaled");
 
         // KQ_masked = mask_past(KQ_scaled)
         struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+        ggml_set_name(KQ_masked, "KQ_masked");
 
         // KQ = soft_max(KQ_masked)
        struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+        ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
         // split cached V into n_head heads
         struct ggml_tensor * V =
@@ -1164,9 +1172,11 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+        ggml_set_name(V, "V");
 
 #if 1
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+        ggml_set_name(KQV, "KQV");
 #else
         // make V contiguous in memory to speed up the matmul, however we waste time on the copy
        // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1177,11 +1187,13 @@ static bool llama_eval_internal(
 
         // KQV_merged = KQV.permute(0, 2, 1, 3)
         struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+        ggml_set_name(KQV_merged, "KQV_merged");
 
         // cur = KQV_merged.contiguous().view(n_embd, N)
         cur = ggml_cpy(ctx0,
                 KQV_merged,
                 ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+        ggml_set_name(cur, "KQV_merged_contiguous");
 
         // projection (no bias)
         cur = ggml_mul_mat(ctx0,
@@ -1273,6 +1285,9 @@ static bool llama_eval_internal(
     //embd_w.resize(n_vocab*N);
     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
 
+    // update kv token count
+    lctx.model.kv_self.n = n_past + N;
+
     // extract logits
     {
         auto & logits_out = lctx.logits;
@@ -1478,109 +1493,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // sampling
 //
 
-
-
-
-
-
-
-
-
+void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+    assert(candidates->size > 0);
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Sort the logits in descending order
+    if (!candidates->sorted) {
+        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        });
+        candidates->sorted = true;
+    }
+
+    float max_l = candidates->data[0].logit;
+    float cum_sum = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float p = expf(candidates->data[i].logit - max_l);
+        candidates->data[i].p = p;
+        cum_sum += p;
+    }
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].p /= cum_sum;
+    }
 
-
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    // select the token with the highest logit directly
-    float max_logit = plogits[0];
-    llama_vocab::id max_id = 0;
-
-    for (int i = 1; i < n_logits; ++i) {
-        if (plogits[i] > max_logit) {
-            max_logit = plogits[i];
-            max_id = i;
-        }
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    k = std::max(k, (int) min_keep);
+    k = std::min(k, (int) candidates->size);
+
+    // Sort scores in descending order
+    if (!candidates->sorted) {
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        };
+        if (k == (int) candidates->size) {
+            std::sort(candidates->data, candidates->data + candidates->size, comp);
+        } else {
+            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
         }
-
+        candidates->sorted = true;
+    }
+    candidates->size = k;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
+}
 
-
-
+void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    if (p >= 1.0f) {
+        return;
+    }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(ctx, candidates);
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        cum_sum += candidates->data[i].p;
+
+        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep) {
+            last_idx = i;
+            break;
         }
     }
 
-
+    // Resize the output vector to keep only the top-p tokens
+    candidates->size = last_idx;
 
-
-
-
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
 
-
-
-
-        const float p = expf(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
+void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+    if (z >= 1.0f || candidates->size <= 2) {
+        return;
     }
 
-
-
-
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(nullptr, candidates);
+
+    // Compute the first and second derivatives
+    std::vector<float> first_derivatives(candidates->size - 1);
+    std::vector<float> second_derivatives(candidates->size - 2);
+
+    for (size_t i = 0; i < first_derivatives.size(); ++i) {
+        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+    }
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
     }
 
-
-
-
-
-
-
-
-
-
+    // Calculate absolute value of second derivatives
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = abs(second_derivatives[i]);
+    }
+
+    // Normalize the second derivatives
+    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+    for (float & value : second_derivatives) {
+        value /= second_derivatives_sum;
+    }
+
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        cum_sum += second_derivatives[i];
+
+        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+        if (cum_sum > z && i >= min_keep) {
+            last_idx = i;
+            break;
         }
     }
 
-    //
-
-
-
-
-
+    // Resize the output vector to keep only the tokens above the tail location
+    candidates->size = last_idx;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+
+void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (p >= 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Compute the softmax of logits and calculate entropy
+    llama_sample_softmax(nullptr, candidates);
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(candidates->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += candidates->data[idx].p;
+
+        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_token_data> new_candidates;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        new_candidates.push_back(candidates->data[idx]);
+    }
+
+    // Replace the data in candidates with the new_candidates data
+    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+    candidates->size = new_candidates.size();
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= temp;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+    if (last_tokens_size == 0 || penalty == 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        if (token_iter == last_tokens + last_tokens_size) {
+            continue;
+        }
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty;
+        } else {
+            candidates->data[i].logit /= penalty;
+        }
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Create a frequency map to count occurrences of each token in last_tokens
+    std::unordered_map<llama_token, int> token_count;
+    for (size_t i = 0; i < last_tokens_size; ++i) {
+        token_count[last_tokens_p[i]]++;
+    }
+
+    // Apply frequency and presence penalties to the candidates
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = token_count.find(candidates->data[i].id);
+        if (token_iter == token_count.end()) {
+            continue;
+        }
+
+        int count = token_iter->second;
+        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+    assert(ctx);
+    auto N = float(llama_n_vocab(ctx));
+    int64_t t_start_sample_us;
+    t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(nullptr, candidates);
+
+    // Estimate s_hat using the most probable m tokens
+    float s_hat = 0.0;
+    float sum_ti_bi = 0.0;
+    float sum_ti_sq = 0.0;
+    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+        float t_i = logf(float(i + 2) / float(i + 1));
+        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+        sum_ti_bi += t_i * b_i;
+        sum_ti_sq += t_i * t_i;
+    }
+    s_hat = sum_ti_bi / sum_ti_sq;
+
+    // Compute k from the estimated s_hat and target surprise value
+    float epsilon_hat = s_hat - 1;
+    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
+
+    // Sample the next word X using top-k sampling
+    llama_sample_top_k(nullptr, candidates, int(k));
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    llama_token X = llama_sample_token(ctx, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+        ctx->n_sample++;
+    }
+    return X;
+}
+
+llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+    assert(ctx);
+    int64_t t_start_sample_us;
+    t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(ctx, candidates);
+
+    // Truncate the words with surprise values greater than mu
+    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return -log2f(candidate.p) > *mu;
+    }));
+
+    // Normalize the probabilities of the remaining words
+    llama_sample_softmax(ctx, candidates);
+
+    // Sample the next word X from the remaining words
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    llama_token X = llama_sample_token(ctx, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    return X;
+}
+
+llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Find max element
+    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit < b.logit;
+    });
+
+    llama_token result = max_iter->id;
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+        ctx->n_sample++;
+    }
+    return result;
+}
+
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+    llama_sample_softmax(nullptr, candidates);
+
+    std::vector<float> probs;
+    probs.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        probs.push_back(candidates->data[i].p);
+    }
 
     std::discrete_distribution<> dist(probs.begin(), probs.end());
+    auto & rng = ctx->rng;
     int idx = dist(rng);
 
-
+    llama_token result = candidates->data[idx].id;
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+    return result;
 }
 
 //
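The hunk above replaces the old built-in top-p/top-k sampler with a set of composable sampling primitives. A rough usage sketch (not taken from this package; the penalty, k, p and temperature values are illustrative, and the helper name `sample_next` is made up) builds a `llama_token_data_array` from the current logits and chains the new functions:

```cpp
#include <vector>
#include "llama.h"

// Sketch: pick the next token after llama_eval() has filled the logits.
llama_token sample_next(llama_context * ctx, const std::vector<llama_token> & last_tokens) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);

    // build the candidate list from the logits of the last evaluated token
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; id++) {
        candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // penalties first, then truncation, then temperature, then the actual draw
    llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), 1.10f);
    llama_sample_top_k(ctx, &candidates_p, 40, 1);
    llama_sample_top_p(ctx, &candidates_p, 0.95f, 1);
    llama_sample_temperature(ctx, &candidates_p, 0.80f);

    return llama_sample_token(ctx, &candidates_p);
}
```

The mirostat variants (`llama_sample_token_mirostat`, `llama_sample_token_mirostat_v2`) and `llama_sample_token_greedy` slot into the same pipeline in place of the final `llama_sample_token` call.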
@@ -1593,7 +1901,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1749,7 +2056,7 @@ struct llama_context * llama_init_from_file(
 
     llama_context * ctx = new llama_context;
 
-    if (params.seed
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
@@ -2084,21 +2391,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE 64*1024
 
 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed
+    if (seed < 0) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
 }
 
-// Returns the size of the state
-size_t llama_get_state_size(struct llama_context * ctx) {
+// Returns the *maximum* size of the state
+size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
@@ -2176,21 +2483,51 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 
     // copy kv cache
     {
-        const
+        const auto & kv_self = ctx->model.kv_self;
+        const auto & hparams = ctx->model.hparams;
+        const int n_layer = hparams.n_layer;
+        const int n_embd = hparams.n_embd;
+        const int n_ctx = hparams.n_ctx;
+
+        const size_t kv_size = kv_self.buf.size;
         const int kv_ntok = llama_get_kv_cache_token_count(ctx);
 
         memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
         memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
 
         if (kv_size) {
-
+            const size_t elt_size = ggml_element_size(kv_self.k);
+            char buffer[4096];
+            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+            ggml_cgraph gf{};
+            gf.n_threads = 1;
+
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            kout3d->data = out;
+            out += ggml_nbytes(kout3d);
+
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            vout3d->data = out;
+            out += ggml_nbytes(vout3d);
+
+            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+                n_embd, kv_ntok, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+                kv_ntok, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+            ggml_graph_compute(cpy_ctx, &gf);
         }
     }
 
     const size_t written = out - dest;
-    const size_t
+    const size_t max_size = llama_get_state_size(ctx);
 
-    LLAMA_ASSERT(written
+    LLAMA_ASSERT(written <= max_size);
 
     return written;
 }
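As the updated comment says, `llama_get_state_size()` now reports an upper bound on the serialized state rather than the exact size, so callers allocate that much and take the return value of `llama_copy_state_data()` as the actual length. A minimal sketch under that reading (the helper names `snapshot_state`/`restore_state` are illustrative, not part of this package):

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

// Snapshot the full context state (rng, logits, embeddings, kv cache) into a byte buffer.
std::vector<uint8_t> snapshot_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx)); // upper bound, not the exact size
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);                                  // shrink to what was actually written
    return buf;
}

// Restore a previously captured snapshot into the same (or an identically configured) context.
void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}
```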
@@ -2248,6 +2585,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 
     // set kv cache
     {
+        const auto & kv_self = ctx->model.kv_self;
+        const auto & hparams = ctx->model.hparams;
+        const int n_layer = hparams.n_layer;
+        const int n_embd = hparams.n_embd;
+        const int n_ctx = hparams.n_ctx;
+
         size_t kv_size;
         int kv_ntok;
 
@@ -2255,29 +2598,125 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
 
         if (kv_size) {
-            LLAMA_ASSERT(
+            LLAMA_ASSERT(kv_self.buf.size == kv_size);
+
+            const size_t elt_size = ggml_element_size(kv_self.k);
+            char buffer[4096];
+            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+            ggml_cgraph gf{};
+            gf.n_threads = 1;
+
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            kin3d->data = (void *) in;
+            in += ggml_nbytes(kin3d);
 
-
-
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            vin3d->data = (void *) in;
+            in += ggml_nbytes(vin3d);
 
-
+            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+                n_embd, kv_ntok, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
 
-
-
+            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+                kv_ntok, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+            ggml_graph_compute(cpy_ctx, &gf);
         }
 
         ctx->model.kv_self.n = kv_ntok;
     }
 
     const size_t nread = in - src;
-    const size_t
+    const size_t max_size = llama_get_state_size(ctx);
 
-    LLAMA_ASSERT(nread
+    LLAMA_ASSERT(nread <= max_size);
 
     return nread;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+
+        llama_hparams session_hparams;
+        file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+        if (session_hparams != ctx->model.hparams) {
+            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+        const size_t n_state_size_max = llama_get_state_size(ctx);
+
+        if (n_state_size_cur > n_state_size_max) {
+            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+            return false;
+        }
+
+        std::vector<uint8_t> state_data(n_state_size_max);
+        file.read_raw(state_data.data(), n_state_size_cur);
+
+        llama_set_state_data(ctx, state_data.data());
+    }
+
+    return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state
+    {
+        const size_t n_state_size_max = llama_get_state_size(ctx);
+
+        std::vector<uint8_t> state_data(n_state_size_max);
+        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+        file.write_raw(state_data.data(), n_state_size_cur);
+    }
+
+    return true;
+}
+
 int llama_eval(
         struct llama_context * ctx,
         const llama_token * tokens,
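The new `llama_save_session_file` / `llama_load_session_file` pair persists the prompt tokens together with the serialized context state, so a long prompt does not have to be re-evaluated on the next run. A hedged sketch of the intended call pattern (the file name and the `demo_session` helper are illustrative):

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

void demo_session(llama_context * ctx, const std::vector<llama_token> & prompt_tokens) {
    // persist the evaluated prompt and the full context state
    llama_save_session_file(ctx, "prompt.session", prompt_tokens.data(), prompt_tokens.size());

    // later, with a context built from the same model/hparams, restore instead of re-evaluating
    std::vector<llama_token> tokens(llama_n_ctx(ctx));
    size_t n_loaded = 0;
    if (llama_load_session_file(ctx, "prompt.session", tokens.data(), tokens.size(), &n_loaded)) {
        tokens.resize(n_loaded);
        printf("restored %zu tokens from session\n", n_loaded);
    }
}
```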
@@ -2316,15 +2755,15 @@ int llama_tokenize(
     return res.size();
 }
 
-int llama_n_vocab(struct llama_context * ctx) {
+int llama_n_vocab(const struct llama_context * ctx) {
     return ctx->vocab.id_to_token.size();
 }
 
-int llama_n_ctx(struct llama_context * ctx) {
+int llama_n_ctx(const struct llama_context * ctx) {
     return ctx->model.hparams.n_ctx;
 }
 
-int llama_n_embd(struct llama_context * ctx) {
+int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
@@ -2336,7 +2775,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     if (token >= llama_n_vocab(ctx)) {
         return nullptr;
     }
@@ -2352,33 +2791,8 @@ llama_token llama_token_eos() {
     return 2;
 }
 
-llama_token
-
-        const llama_token * last_n_tokens_data,
-        int last_n_tokens_size,
-        int top_k,
-        float top_p,
-        float temp,
-        float repeat_penalty) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_token result = 0;
-
-    // TODO: avoid this ...
-    const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-
-    result = llama_sample_top_p_top_k(
-            *ctx,
-            last_n_tokens,
-            top_k,
-            top_p,
-            temp,
-            repeat_penalty);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-
-    return result;
+llama_token llama_token_nl() {
+    return 13;
 }
 
 
@@ -2430,4 +2844,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
-