llama_cpp 0.0.6 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +762 -36
- data/ext/llama_cpp/src/ggml-cuda.h +11 -4
- data/ext/llama_cpp/src/ggml-opencl.c +398 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1957 -909
- data/ext/llama_cpp/src/ggml.h +696 -627
- data/ext/llama_cpp/src/{llama_util.h → llama-util.h} +91 -12
- data/ext/llama_cpp/src/llama.cpp +755 -159
- data/ext/llama_cpp/src/llama.h +85 -34
- data/lib/llama_cpp/client.rb +174 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +43 -11
- data/sig/llama_cpp.rbs +53 -3
- metadata +6 -3
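The largest C++ change in this release is the rewritten sampling API in `llama.cpp` (shown in the diff below): the single `llama_sample_top_p_top_k` helper is replaced by composable functions that operate on a `llama_token_data_array`. The following is a minimal sketch of how those new functions chain together, based only on the signatures visible in the diff; `ctx`, `last_tokens`, the helper name `sample_next_token`, and the specific penalty/top-k/top-p/temperature values are illustrative assumptions, not part of the gem.

```cpp
#include <vector>
#include "llama.h"

// Sketch of the new composable sampling flow added in this release.
// `ctx` is an already-initialized llama_context (after llama_eval),
// `last_tokens` holds recently generated token ids for the repetition penalty.
llama_token sample_next_token(llama_context * ctx, const std::vector<llama_token> & last_tokens) {
    const int n_vocab = llama_n_vocab(ctx);
    const float * logits = llama_get_logits(ctx);

    // build the candidate array from the logits of the last evaluated token
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; ++token_id) {
        candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f });
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // penalties first, then truncation, then temperature, then the final draw
    llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), 1.1f);
    llama_sample_top_k(ctx, &candidates_p, 40, 1);
    llama_sample_top_p(ctx, &candidates_p, 0.95f, 1);
    llama_sample_temperature(ctx, &candidates_p, 0.8f);

    return llama_sample_token(ctx, &candidates_p);
}
```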
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -5,7 +5,7 @@
 #include <cstdio>
 #endif
 
-#include "llama_util.h"
+#include "llama-util.h"
 #include "llama.h"
 
 #include "ggml.h"
@@ -27,11 +27,12 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
+#include <sstream>
+#include <numeric>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
-
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
-}
+}
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_65B, 5120ull * MB },
     };
     return _MEM_REQ_KV_SELF;
-}
+}
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_65B, 1536ull * MB },
     };
     return _MEM_REQ_EVAL;
-}
+}
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
@@ -135,7 +136,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -166,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -227,8 +228,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -482,7 +483,9 @@ struct llama_file_loader {
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -556,7 +559,9 @@ struct llama_file_saver {
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -654,6 +659,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
+        ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -722,8 +728,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -775,7 +780,7 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int64_t n_mem =
+    const int64_t n_mem = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -794,6 +799,8 @@ static bool kv_cache_init(
 
     cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");
 
     return true;
 }
@@ -802,7 +809,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx      =*/ 512,
         /*.n_parts    =*/ -1,
-        /*.seed       =*/ 0,
+        /*.seed       =*/ -1,
         /*.f16_kv     =*/ false,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
@@ -846,7 +853,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
-        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
         default: return "unknown, may not work";
     }
 }
@@ -1075,9 +1084,10 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1104,6 +1114,8 @@ static bool llama_eval_internal(
         // compute Q and K and RoPE them
         struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
         struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        ggml_set_name(Qcur, "Qcur");
+        ggml_set_name(Kcur, "Kcur");
 
         // store key and value to memory
         {
@@ -1124,6 +1136,7 @@ static bool llama_eval_internal(
             ggml_permute(ctx0,
                     Qcur,
                     0, 2, 1, 3);
+        ggml_set_name(Q, "Q");
 
         struct ggml_tensor * K =
             ggml_permute(ctx0,
@@ -1131,21 +1144,26 @@ static bool llama_eval_internal(
                         ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                         n_embd/n_head, n_head, n_past + N),
                     0, 2, 1, 3);
+        ggml_set_name(K, "K");
 
         // K * Q
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+        ggml_set_name(KQ, "KQ");
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
-        struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
+        struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+        ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+        ggml_set_name(KQ_scaled, "KQ_scaled");
 
         // KQ_masked = mask_past(KQ_scaled)
         struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+        ggml_set_name(KQ_masked, "KQ_masked");
 
         // KQ = soft_max(KQ_masked)
         struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+        ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
         // split cached V into n_head heads
         struct ggml_tensor * V =
@@ -1154,9 +1172,11 @@ static bool llama_eval_internal(
                     n_ctx*ggml_element_size(kv_self.v),
                     n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                     il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+        ggml_set_name(V, "V");
 
 #if 1
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+        ggml_set_name(KQV, "KQV");
 #else
         // make V contiguous in memory to speed up the matmul, however we waste time on the copy
         // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1167,11 +1187,13 @@ static bool llama_eval_internal(
 
         // KQV_merged = KQV.permute(0, 2, 1, 3)
         struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+        ggml_set_name(KQV_merged, "KQV_merged");
 
         // cur = KQV_merged.contiguous().view(n_embd, N)
         cur = ggml_cpy(ctx0,
                 KQV_merged,
                 ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+        ggml_set_name(cur, "KQV_merged_contiguous");
 
         // projection (no bias)
         cur = ggml_mul_mat(ctx0,
@@ -1249,9 +1271,11 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+#ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    //ggml_graph_print(&gf);
+    ggml_graph_print(&gf);
+#endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
@@ -1261,6 +1285,9 @@ static bool llama_eval_internal(
     //embd_w.resize(n_vocab*N);
     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
 
+    // update kv token count
+    lctx.model.kv_self.n = n_past + N;
+
     // extract logits
     {
         auto & logits_out = lctx.logits;
@@ -1466,109 +1493,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // sampling
 //
 
-[... old lines 1469-1476 not shown in this rendering ...]
+void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+    assert(candidates->size > 0);
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Sort the logits in descending order
+    if (!candidates->sorted) {
+        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        });
+        candidates->sorted = true;
+    }
 
-[... old line 1478 not shown in this rendering ...]
+    float max_l = candidates->data[0].logit;
+    float cum_sum = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float p = expf(candidates->data[i].logit - max_l);
+        candidates->data[i].p = p;
+        cum_sum += p;
+    }
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].p /= cum_sum;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
 }
 
-[... old lines 1481-1495 not shown in this rendering ...]
-        // select the token with the highest logit directly
-        float max_logit = plogits[0];
-        llama_vocab::id max_id = 0;
-
-        for (int i = 1; i < n_logits; ++i) {
-            if (plogits[i] > max_logit) {
-                max_logit = plogits[i];
-                max_id = i;
-            }
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    k = std::max(k, (int) min_keep);
+    k = std::min(k, (int) candidates->size);
+
+    // Sort scores in descending order
+    if (!candidates->sorted) {
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        };
+        if (k == (int) candidates->size) {
+            std::sort(candidates->data, candidates->data + candidates->size, comp);
+        } else {
+            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
         }
-[... old line 1506 not shown in this rendering ...]
+        candidates->sorted = true;
+    }
+    candidates->size = k;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
+}
 
-[... old lines 1509-1510 not shown in this rendering ...]
+void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    if (p >= 1.0f) {
+        return;
+    }
 
-[... old lines 1512-1526 not shown in this rendering ...]
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(ctx, candidates);
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        cum_sum += candidates->data[i].p;
+
+        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep) {
+            last_idx = i;
+            break;
         }
     }
 
-[... old line 1530 not shown in this rendering ...]
+    // Resize the output vector to keep only the top-p tokens
+    candidates->size = last_idx;
 
-[... old lines 1532-1534 not shown in this rendering ...]
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
 
-[... old lines 1536-1538 not shown in this rendering ...]
-        const float p = expf(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
+void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+    if (z >= 1.0f || candidates->size <= 2) {
+        return;
     }
 
-[... old lines 1544-1546 not shown in this rendering ...]
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(nullptr, candidates);
+
+    // Compute the first and second derivatives
+    std::vector<float> first_derivatives(candidates->size - 1);
+    std::vector<float> second_derivatives(candidates->size - 2);
+
+    for (size_t i = 0; i < first_derivatives.size(); ++i) {
+        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+    }
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
     }
 
-[... old lines 1549-1557 not shown in this rendering ...]
+    // Calculate absolute value of second derivatives
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = abs(second_derivatives[i]);
+    }
+
+    // Normalize the second derivatives
+    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+    for (float & value : second_derivatives) {
+        value /= second_derivatives_sum;
+    }
+
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        cum_sum += second_derivatives[i];
+
+        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+        if (cum_sum > z && i >= min_keep) {
+            last_idx = i;
+            break;
         }
     }
 
-[... old lines 1561-1566 not shown in this rendering ...]
+    // Resize the output vector to keep only the tokens above the tail location
+    candidates->size = last_idx;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+
+void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (p >= 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Compute the softmax of logits and calculate entropy
+    llama_sample_softmax(nullptr, candidates);
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(candidates->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += candidates->data[idx].p;
+
+        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_token_data> new_candidates;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        new_candidates.push_back(candidates->data[idx]);
+    }
+
+    // Replace the data in candidates with the new_candidates data
+    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+    candidates->size = new_candidates.size();
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= temp;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+    if (last_tokens_size == 0 || penalty == 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        if (token_iter == last_tokens + last_tokens_size) {
+            continue;
+        }
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty;
+        } else {
+            candidates->data[i].logit /= penalty;
+        }
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Create a frequency map to count occurrences of each token in last_tokens
+    std::unordered_map<llama_token, int> token_count;
+    for (size_t i = 0; i < last_tokens_size; ++i) {
+        token_count[last_tokens_p[i]]++;
+    }
+
+    // Apply frequency and presence penalties to the candidates
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = token_count.find(candidates->data[i].id);
+        if (token_iter == token_count.end()) {
+            continue;
+        }
+
+        int count = token_iter->second;
+        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+    assert(ctx);
+    auto N = float(llama_n_vocab(ctx));
+    int64_t t_start_sample_us;
+    t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(nullptr, candidates);
+
+    // Estimate s_hat using the most probable m tokens
+    float s_hat = 0.0;
+    float sum_ti_bi = 0.0;
+    float sum_ti_sq = 0.0;
+    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+        float t_i = logf(float(i + 2) / float(i + 1));
+        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+        sum_ti_bi += t_i * b_i;
+        sum_ti_sq += t_i * t_i;
+    }
+    s_hat = sum_ti_bi / sum_ti_sq;
+
+    // Compute k from the estimated s_hat and target surprise value
+    float epsilon_hat = s_hat - 1;
+    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
+
+    // Sample the next word X using top-k sampling
+    llama_sample_top_k(nullptr, candidates, int(k));
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    llama_token X = llama_sample_token(ctx, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+        ctx->n_sample++;
+    }
+    return X;
+}
+
+llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+    assert(ctx);
+    int64_t t_start_sample_us;
+    t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(ctx, candidates);
+
+    // Truncate the words with surprise values greater than mu
+    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return -log2f(candidate.p) > *mu;
+    }));
+
+    // Normalize the probabilities of the remaining words
+    llama_sample_softmax(ctx, candidates);
+
+    // Sample the next word X from the remaining words
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    llama_token X = llama_sample_token(ctx, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    return X;
+}
+
+llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Find max element
+    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit < b.logit;
+    });
+
+    llama_token result = max_iter->id;
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+        ctx->n_sample++;
+    }
+    return result;
+}
+
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+    llama_sample_softmax(nullptr, candidates);
+
+    std::vector<float> probs;
+    probs.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        probs.push_back(candidates->data[i].p);
+    }
 
     std::discrete_distribution<> dist(probs.begin(), probs.end());
+    auto & rng = ctx->rng;
     int idx = dist(rng);
 
-[... old line 1571 not shown in this rendering ...]
+    llama_token result = candidates->data[idx].id;
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+    return result;
 }
 
 //
@@ -1581,7 +1901,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
@@ -1618,8 +1940,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
-        //
-        //if (tensor.name
+        // uncomment this to keep the output layer in FP16
+        //if (tensor.name == "output.weight") {
         //    quantize = false;
         //}
 
@@ -1734,7 +2056,7 @@ struct llama_context * llama_init_from_file(
 
     llama_context * ctx = new llama_context;
 
-    if (params.seed <= 0) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
@@ -1787,7 +2109,7 @@ struct llama_context * llama_init_from_file(
         if (params.logits_all) {
             ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
         } else {
-            ctx->logits.reserve(hparams.n_ctx);
+            ctx->logits.reserve(hparams.n_vocab);
        }
 
        if (params.embedding){
@@ -2069,31 +2391,330 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-[... old lines 2072-2073 not shown in this rendering ...]
-const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.addr;
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
 }
 
-[... old lines 2078-2080 not shown in this rendering ...]
+#define LLAMA_MAX_RNG_STATE 64*1024
+
+void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+    if (seed < 0) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
 }
 
-[... old lines 2083-2084 not shown in this rendering ...]
+// Returns the *maximum* size of the state
+size_t llama_get_state_size(const struct llama_context * ctx) {
+    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+    // for reference, std::mt19937(1337) serializes to 6701 bytes.
+    const size_t s_rng_size        = sizeof(size_t);
+    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
+    const size_t s_logits_capacity = sizeof(size_t);
+    const size_t s_logits_size     = sizeof(size_t);
+    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
+    const size_t s_embedding_size  = sizeof(size_t);
+    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
+    const size_t s_kv_size         = sizeof(size_t);
+    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_kv              = ctx->model.kv_self.buf.size;
+
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_logits_capacity
+        + s_logits_size
+        + s_logits
+        + s_embedding_size
+        + s_embedding
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+
+    return s_total;
 }
 
-[... old lines 2087-2096 not shown in this rendering ...]
+// Copies the state to the specified destination address
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+    uint8_t * out = dest;
+
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << ctx->rng;
+
+        const size_t rng_size = rng_ss.str().size();
+        char rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+        memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+        memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+    }
+
+    // copy logits
+    {
+        const size_t logits_cap  = ctx->logits.capacity();
+        const size_t logits_size = ctx->logits.size();
+
+        memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
+        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+        if (logits_size) {
+            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+        }
+
+        out += logits_cap * sizeof(float);
+    }
+
+    // copy embeddings
+    {
+        const size_t embedding_size = ctx->embedding.size();
+
+        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+        if (embedding_size) {
+            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+            out += embedding_size * sizeof(float);
+        }
+    }
+
+    // copy kv cache
+    {
+        const auto & kv_self = ctx->model.kv_self;
+        const auto & hparams = ctx->model.hparams;
+        const int n_layer = hparams.n_layer;
+        const int n_embd  = hparams.n_embd;
+        const int n_ctx   = hparams.n_ctx;
+
+        const size_t kv_size = kv_self.buf.size;
+        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            const size_t elt_size = ggml_element_size(kv_self.k);
+            char buffer[4096];
+            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+            ggml_cgraph gf{};
+            gf.n_threads = 1;
+
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            kout3d->data = out;
+            out += ggml_nbytes(kout3d);
+
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            vout3d->data = out;
+            out += ggml_nbytes(vout3d);
+
+            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+                n_embd, kv_ntok, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+                kv_ntok, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+            ggml_graph_compute(cpy_ctx, &gf);
+        }
+    }
+
+    const size_t written  = out - dest;
+    const size_t max_size = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(written <= max_size);
+
+    return written;
+}
+
+// Sets the state reading from the specified source address
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * in = src;
+
+    // set rng
+    {
+        size_t rng_size;
+        char   rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+        std::stringstream rng_ss;
+        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        rng_ss >> ctx->rng;
+
+        LLAMA_ASSERT(rng_ss.fail() == false);
+    }
+
+    // set logits
+    {
+        size_t logits_cap;
+        size_t logits_size;
+
+        memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
+        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+        if (logits_size) {
+            ctx->logits.resize(logits_size);
+            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+        }
+
+        in += logits_cap * sizeof(float);
+    }
+
+    // set embeddings
+    {
+        size_t embedding_size;
+
+        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+        if (embedding_size) {
+            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+            in += embedding_size * sizeof(float);
+        }
+    }
+
+    // set kv cache
+    {
+        const auto & kv_self = ctx->model.kv_self;
+        const auto & hparams = ctx->model.hparams;
+        const int n_layer = hparams.n_layer;
+        const int n_embd  = hparams.n_embd;
+        const int n_ctx   = hparams.n_ctx;
+
+        size_t kv_size;
+        int    kv_ntok;
+
+        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            LLAMA_ASSERT(kv_self.buf.size == kv_size);
+
+            const size_t elt_size = ggml_element_size(kv_self.k);
+            char buffer[4096];
+            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+            ggml_cgraph gf{};
+            gf.n_threads = 1;
+
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            kin3d->data = (void *) in;
+            in += ggml_nbytes(kin3d);
+
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            vin3d->data = (void *) in;
+            in += ggml_nbytes(vin3d);
+
+            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+                n_embd, kv_ntok, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+                kv_ntok, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+            ggml_graph_compute(cpy_ctx, &gf);
+        }
+
+        ctx->model.kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    const size_t max_size = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(nread <= max_size);
+
+    return nread;
+}
+
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+
+        llama_hparams session_hparams;
+        file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+        if (session_hparams != ctx->model.hparams) {
+            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+        const size_t n_state_size_max = llama_get_state_size(ctx);
+
+        if (n_state_size_cur > n_state_size_max) {
+            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+            return false;
+        }
+
+        std::vector<uint8_t> state_data(n_state_size_max);
+        file.read_raw(state_data.data(), n_state_size_cur);
+
+        llama_set_state_data(ctx, state_data.data());
+    }
+
+    return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state
+    {
+        const size_t n_state_size_max = llama_get_state_size(ctx);
+
+        std::vector<uint8_t> state_data(n_state_size_max);
+        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+        file.write_raw(state_data.data(), n_state_size_cur);
+    }
+
+    return true;
 }
 
 int llama_eval(
@@ -2134,15 +2755,15 @@ int llama_tokenize(
     return res.size();
 }
 
-int llama_n_vocab(struct llama_context * ctx) {
+int llama_n_vocab(const struct llama_context * ctx) {
     return ctx->vocab.id_to_token.size();
 }
 
-int llama_n_ctx(struct llama_context * ctx) {
+int llama_n_ctx(const struct llama_context * ctx) {
     return ctx->model.hparams.n_ctx;
 }
 
-int llama_n_embd(struct llama_context * ctx) {
+int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
@@ -2154,7 +2775,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     if (token >= llama_n_vocab(ctx)) {
         return nullptr;
     }
@@ -2170,33 +2791,8 @@ llama_token llama_token_eos() {
     return 2;
 }
 
-llama_token llama_sample_top_p_top_k(
-          llama_context * ctx,
-    const llama_token * last_n_tokens_data,
-                    int last_n_tokens_size,
-                    int top_k,
-                  float top_p,
-                  float temp,
-                  float repeat_penalty) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_token result = 0;
-
-    // TODO: avoid this ...
-    const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-
-    result = llama_sample_top_p_top_k(
-            *ctx,
-            last_n_tokens,
-            top_k,
-            top_p,
-            temp,
-            repeat_penalty);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-
-    return result;
+llama_token llama_token_nl() {
+    return 13;
 }
 
 
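The other notable addition visible in this diff is context-state serialization (`llama_get_state_size`, `llama_copy_state_data`, `llama_set_state_data`) plus the session-file helpers built on top of it (`llama_load_session_file`, `llama_save_session_file`). A rough sketch of how these new entry points are meant to be used, with signatures taken from the diff above; error handling, the token-history bookkeeping, the file name `session.bin`, and the helper names are assumptions for illustration.

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

// Snapshot the full context state (RNG, logits, embeddings, KV cache) into a
// byte buffer, then restore it later to roll the generation back.
void snapshot_and_restore(llama_context * ctx) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));   // maximum state size
    const size_t written = llama_copy_state_data(ctx, state.data()); // save (written <= state.size())

    // ... evaluate more tokens, explore an alternative continuation, etc. ...

    llama_set_state_data(ctx, state.data());                 // roll back to the snapshot
    (void) written; // keep `written` around if you persist the buffer yourself
}

// Or persist the whole session (prompt tokens + state) to disk in one call.
bool persist_session(llama_context * ctx, const std::vector<llama_token> & tokens) {
    return llama_save_session_file(ctx, "session.bin", tokens.data(), tokens.size());
}
```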