llama_cpp 0.3.1 → 0.3.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +11 -2
- data/ext/llama_cpp/llama_cpp.cpp +284 -111
- data/ext/llama_cpp/src/ggml-cuda.cu +639 -148
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +19 -6
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +1734 -2248
- data/ext/llama_cpp/src/ggml.h +152 -80
- data/ext/llama_cpp/src/llama.cpp +282 -90
- data/ext/llama_cpp/src/llama.h +30 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -13
- data/sig/llama_cpp.rbs +22 -2
- metadata +5 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,9 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_MPI
+#include "ggml-mpi.h"
+#endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
 #ifdef GGML_QKK_64
@@ -66,6 +69,7 @@ enum e_model {
 MODEL_65B,
 };
 
+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -78,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
 (void) tensor;
 }
 
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+if (plan.work_size > 0) {
+buf.resize(plan.work_size);
+plan.work_data = buf.data();
+}
+
+ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
 static std::map<e_model, size_t> k_sizes = {
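The ggml_graph_compute_helper added above is what later hunks call in place of the removed gf.n_threads field: plan scratch memory lives in a reusable buffer and the thread count is passed per call. A minimal caller sketch (illustrative only; everything except the helper itself is an assumption, not taken from this diff):

std::vector<uint8_t> work_buffer;                        // reused across evaluations, like lctx.work_buffer below
ggml_cgraph gf = {};
ggml_build_forward_expand(&gf, cur);                     // `cur` is the output tensor of the graph
ggml_graph_compute_helper(work_buffer, &gf, n_threads);  // plans, resizes the buffer if needed, then computes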
@@ -129,6 +152,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 return k_sizes;
 }
 
+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+static std::map<e_model, size_t> k_sizes = {
+{ MODEL_3B, 512ull * kB },
+{ MODEL_7B, 512ull * kB },
+{ MODEL_13B, 640ull * kB },
+{ MODEL_30B, 768ull * kB },
+{ MODEL_65B, 1536ull * kB },
+};
+return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+static std::map<e_model, size_t> k_sizes = {
+{ MODEL_3B, 128ull },
+{ MODEL_7B, 128ull },
+{ MODEL_13B, 160ull },
+{ MODEL_30B, 208ull },
+{ MODEL_65B, 416ull },
+};
+return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
 uint32_t n_vocab = 32000;
@@ -165,8 +216,8 @@ struct llama_layer {
 };
 
 struct llama_kv_cache {
-struct ggml_tensor * k;
-struct ggml_tensor * v;
+struct ggml_tensor * k = NULL;
+struct ggml_tensor * v = NULL;
 
 struct ggml_context * ctx = NULL;
 
@@ -253,7 +304,13 @@ struct llama_model {
 
 struct llama_context {
 llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+#ifdef GGML_USE_METAL
+~llama_context() {
+if (ctx_metal) {
+ggml_metal_free(ctx_metal);
+}
+}
+#endif
 std::mt19937 rng;
 
 bool has_evaluated_once = false;
@@ -286,6 +343,9 @@ struct llama_context {
 // input embedding (1-dimensional array: [n_embd])
 std::vector<float> embedding;
 
+// reusable buffer for `struct ggml_graph_plan.work_data`
+std::vector<uint8_t> work_buffer;
+
 // memory buffers used to evaluate the model
 // TODO: move in llama_state
 llama_ctx_buffer buf_compute;
@@ -295,6 +355,10 @@ struct llama_context {
 ggml_metal_context * ctx_metal = NULL;
 #endif
 
+#ifdef GGML_USE_MPI
+ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
 int buf_last = 0;
 size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -446,9 +510,7 @@ struct llama_file_loader {
 std::string word = file.read_string(len);
 
 float score = 0.0f;
-
-file.read_raw(&score, sizeof(score));
-}
+file.read_raw(&score, sizeof(score));
 
 vocab.token_to_id[word] = i;
 
@@ -725,7 +787,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -816,7 +877,7 @@ bool llama_mlock_supported() {
 return llama_mlock::SUPPORTED;
 }
 
-void
+void llama_backend_init(bool numa) {
 ggml_time_init();
 
 // needed to initialize f16 tables
@@ -829,6 +890,16 @@ void llama_init_backend(bool numa) {
 if (numa) {
 ggml_numa_init();
 }
+
+#ifdef GGML_USE_MPI
+ggml_mpi_backend_init();
+#endif
+}
+
+void llama_backend_free() {
+#ifdef GGML_USE_MPI
+ggml_mpi_backend_free();
+#endif
 }
 
 int64_t llama_time_us() {
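The hunk above renames llama_init_backend to llama_backend_init and adds a matching llama_backend_free. A hedged usage sketch of just these two calls (the surrounding steps are assumptions, not from this diff):

llama_backend_init(/*numa =*/ false);  // once per process; also runs ggml_mpi_backend_init() in MPI builds
// ... load a model, create contexts, evaluate, sample ...
llama_backend_free();                  // once at shutdown; runs ggml_mpi_backend_free() in MPI builds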
@@ -1112,14 +1183,18 @@ static void llama_model_load_internal(
 fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_cuda_set_scratch_size(0); // disable scratch
 } else {
-
+const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
 ggml_cuda_set_scratch_size(vram_scratch);
 if (n_gpu_layers > 0) {
-fprintf(stderr, "%s: allocating batch_size x
-__func__,
+fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+__func__, vram_scratch_base / kB, vram_scratch_per_context,
+(vram_scratch + MB - 1) / MB); // round up
 }
 }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
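To make the new scratch-buffer sizing concrete, a worked example using the table values from the earlier hunk (the batch and context sizes are illustrative assumptions, not defaults asserted by this diff):

// vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context)
// For a 7B model with n_batch = 512 and n_ctx = 2048:
//   = 512 * (512 kB + 2048 * 128 B)
//   = 512 * (524288 B + 262144 B)
//   = 402653184 B, roughly 384 MB of VRAM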
@@ -1128,6 +1203,10 @@ static void llama_model_load_internal(
 fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
 }
 size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+const int max_backend_supported_layers = hparams.n_layer + 3;
+const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
 if (n_gpu_layers > (int) hparams.n_layer + 1) {
 if (low_vram) {
 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1223,18 @@ static void llama_model_load_internal(
 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
 }
 }
-
+#elif defined(GGML_USE_CLBLAST)
+const int max_backend_supported_layers = hparams.n_layer + 1;
+const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
 fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-__func__, std::min(n_gpu_layers, max_offloadable_layers),
+__func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
 fprintf(stderr, "%s: total VRAM used: %zu MB\n",
 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
 (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 }
 
 // populate `tensors_by_name`
@@ -1218,18 +1301,16 @@ static bool llama_eval_internal(
 llama_context & lctx,
 const llama_token * tokens,
 const float * embd,
-
-
-
+int n_tokens,
+int n_past,
+int n_threads,
 const char * cgraph_fname) {
 
 LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-
-
-
-return false;
-}
+#ifdef GGML_USE_MPI
+ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
 
 const int64_t t_start_us = ggml_time_us();
 
@@ -1261,20 +1342,26 @@ static bool llama_eval_internal(
 
 struct ggml_context * ctx0 = ggml_init(params);
 
+ggml_cgraph gf = {};
+
 // for big prompts, if BLAS is enabled, it is better to use only one thread
 // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
-gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
 if (tokens) {
-struct ggml_tensor *
-
-
-
+struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ggml_set_name(inp_tokens, "inp_tokens");
+
+inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
 } else {
+#ifdef GGML_USE_MPI
+GGML_ASSERT(false && "not implemented");
+#endif
+
 inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
 memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
 }
@@ -1292,18 +1379,20 @@ static bool llama_eval_internal(
 offload_func_t offload_func_v = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
-
-
-
+if (n_gpu_layers > n_layer) {
+offload_func_nr = ggml_cuda_assign_buffers;
+}
+if (n_gpu_layers > n_layer + 1) {
+offload_func_v = ggml_cuda_assign_buffers;
+}
+if (n_gpu_layers > n_layer + 2) {
+offload_func_kq = ggml_cuda_assign_buffers;
+}
 #endif // GGML_USE_CUBLAS
 
 for (int il = 0; il < n_layer; ++il) {
+ggml_format_name(inpL, "layer_inp_%d", il);
+
 offload_func_t offload_func = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
@@ -1510,7 +1599,6 @@ static bool llama_eval_internal(
 
 // input for next layer
 inpL = cur;
-
 }
 
 lctx.use_buf(ctx0, 0);
@@ -1518,7 +1606,6 @@ static bool llama_eval_internal(
 // used at the end to optionally extract the embeddings
 struct ggml_tensor * embeddings = NULL;
 
-
 // norm
 {
 cur = ggml_rms_norm(ctx0, inpL);
@@ -1533,7 +1620,6 @@ static bool llama_eval_internal(
 embeddings = cur;
 }
 
-
 // lm_head
 cur = ggml_mul_mat(ctx0, model.output, cur);
 ggml_set_name(cur, "result_output");
@@ -1546,8 +1632,13 @@ static bool llama_eval_internal(
 // run the computation
 ggml_build_forward_expand(&gf, cur);
 
+#if GGML_USE_MPI
+ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
 #ifdef GGML_USE_METAL
 if (lctx.ctx_metal && N == 1) {
+ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
 ggml_metal_graph_compute(lctx.ctx_metal, &gf);
 ggml_metal_get_tensor (lctx.ctx_metal, cur);
 } else {
@@ -1567,12 +1658,21 @@ static bool llama_eval_internal(
 ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
 }
 
-
+ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 }
 #else
-
+ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif
 
+#if GGML_USE_MPI
+ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
+// update kv token count
+lctx.kv_self.n = n_past + N;
+
+struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
 if (cgraph_fname) {
 ggml_graph_export(&gf, cgraph_fname);
 }
@@ -1588,23 +1688,17 @@ static bool llama_eval_internal(
 // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
 //}
 
-//embd_w.resize(n_vocab*N);
-//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
-// update kv token count
-lctx.kv_self.n = n_past + N;
-
 // extract logits
 {
 auto & logits_out = lctx.logits;
 
 if (lctx.logits_all) {
 logits_out.resize(n_vocab * N);
-memcpy(logits_out.data(), (float *) ggml_get_data(
+memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
 } else {
 // return result for just the last token
 logits_out.resize(n_vocab);
-memcpy(logits_out.data(), (float *) ggml_get_data(
+memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
 }
 }
 
@@ -1860,10 +1954,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
 return;
 }
 
-const int64_t t_start_sample_us = ggml_time_us();
-
 llama_sample_softmax(ctx, candidates);
 
+const int64_t t_start_sample_us = ggml_time_us();
+
 // Compute the cumulative probabilities
 float cum_sum = 0.0f;
 size_t last_idx = candidates->size;
@@ -1892,9 +1986,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
 return;
 }
 
-const int64_t t_start_sample_us = ggml_time_us();
-
 llama_sample_softmax(nullptr, candidates);
+const int64_t t_start_sample_us = ggml_time_us();
 
 // Compute the first and second derivatives
 std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +2039,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
 return;
 }
 
-const int64_t t_start_sample_us = ggml_time_us();
-
 // Compute the softmax of logits and calculate entropy
 llama_sample_softmax(nullptr, candidates);
 
+const int64_t t_start_sample_us = ggml_time_us();
+
 float entropy = 0.0f;
 for (size_t i = 0; i < candidates->size; ++i) {
 entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2074,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
 }
 }
 
+static void llama_log_softmax(float * array, size_t size) {
+float max_l = *std::max_element(array, array + size);
+float sum = 0.f;
+for (size_t i = 0; i < size; ++i) {
+float p = expf(array[i] - max_l);
+sum += p;
+array[i] = p;
+}
+
+for (size_t i = 0; i < size; ++i) {
+array[i] = logf(array[i] / sum);
+}
+}
+
+void llama_sample_classifier_free_guidance(
+struct llama_context * ctx,
+llama_token_data_array * candidates,
+struct llama_context * guidance_ctx,
+float scale,
+float smooth_factor) {
+int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+assert(ctx);
+auto n_vocab = llama_n_vocab(ctx);
+assert(n_vocab == (int)candidates->size);
+assert(!candidates->sorted);
+
+std::vector<float> logits_base;
+logits_base.reserve(candidates->size);
+for (size_t i = 0; i < candidates->size; ++i) {
+logits_base.push_back(candidates->data[i].logit);
+}
+llama_log_softmax(logits_base.data(), candidates->size);
+
+float* logits_guidance = llama_get_logits(guidance_ctx);
+llama_log_softmax(logits_guidance, n_vocab);
+
+for (int i = 0; i < n_vocab; ++i) {
+float logit_guidance = logits_guidance[i];
+float logit_base = logits_base[i];
+logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+}
+
+llama_log_softmax(logits_guidance, n_vocab);
+
+for (int i = 0; i < n_vocab; ++i) {
+float logit_base = logits_base[i];
+float logit_guidance = logits_guidance[i];
+
+candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+}
+
+if (ctx) {
+ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+}
 
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
 assert(ctx);
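Restating the classifier-free guidance blend implemented above, per vocabulary index i (a summary of the code in this hunk, not additional behavior):

// b   = log_softmax(logits of `candidates`)      (the base context)
// g   = log_softmax(logits of `guidance_ctx`)    (the guidance context)
// mix = log_softmax(scale * (b - g) + g)
// candidates->data[i].logit = smooth_factor * mix[i] + (1 - smooth_factor) * b[i]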
@@ -2119,13 +2268,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
 if (ctx) {
 ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-ctx->n_sample++;
 }
 return X;
 }
 
 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-assert(ctx);
 int64_t t_start_sample_us;
 t_start_sample_us = ggml_time_us();
 
@@ -2140,13 +2287,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
 candidates->size = 1;
 }
 
+if (ctx) {
+ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 // Normalize the probabilities of the remaining words
 llama_sample_softmax(ctx, candidates);
 
 // Sample the next word X from the remaining words
-if (ctx) {
-ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
 llama_token X = llama_sample_token(ctx, candidates);
 t_start_sample_us = ggml_time_us();
 
@@ -2214,10 +2362,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
 }
 float * f32_output = (float *) output.addr;
 
-
+ggml_type_traits_t qtype;
 if (ggml_is_quantized(tensor.type)) {
-qtype =
-if (qtype.
+qtype = ggml_internal_get_type_traits(tensor.type);
+if (qtype.to_float == NULL) {
 throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
 }
 } else if (tensor.type != GGML_TYPE_F16) {
@@ -2228,7 +2376,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
 if (tensor.type == GGML_TYPE_F16) {
 ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
 } else if (ggml_is_quantized(tensor.type)) {
-qtype.
+qtype.to_float(tensor.data, f32_output, nelements);
 } else {
 LLAMA_ASSERT(false); // unreachable
 }
@@ -2253,7 +2401,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
 if (typ == GGML_TYPE_F16) {
 ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
 } else {
-qtype.
+qtype.to_float(inbuf, outbuf, nels);
 }
 };
 workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -2362,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 } else {
 new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+bool convert_incompatible_tensor = false;
 if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
 int nx = tensor.ne.at(0);
 int ny = tensor.ne.at(1);
 if (nx % QK_K != 0 || ny % QK_K != 0) {
-fprintf(stderr, "\n\
-
-fprintf(stderr, "========================================================================================\n\n");
-throw std::runtime_error("Unsupported tensor size encountered\n");
+fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+convert_incompatible_tensor = true;
 }
 }
 if (tensor.name == "output.weight") {
@@ -2398,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
 }
+if (convert_incompatible_tensor) {
+if (tensor.name == "output.weight") {
+new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+fprintf(stderr, "F16 will be used for this tensor instead.\n");
+} else if (tensor.name == "tok_embeddings.weight") {
+new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+} else {
+throw std::runtime_error("Unsupported tensor size encountered\n");
+}
+}
 #endif
 
 float * f32_data;
@@ -2532,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
 }
 
 struct llama_context * llama_new_context_with_model(
-
-
+struct llama_model * model,
+struct llama_context_params params) {
 
 if (!model) {
 return nullptr;
@@ -2602,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
 if (params.n_gpu_layers > 0) {
 // this allocates all Metal resources and memory buffers
-ctx->ctx_metal = ggml_metal_init();
+ctx->ctx_metal = ggml_metal_init(1);
 
 void * data_ptr = NULL;
 size_t data_size = 0;
@@ -2637,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
 }
 #endif
 
+#ifdef GGML_USE_MPI
+ctx->ctx_mpi = ggml_mpi_init();
+
+if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+llama_backend_free();
+exit(1);
+}
+#endif
+
 return ctx;
 }
 
@@ -2759,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 // read tensors and apply
 bool warned = false;
 int n_tensors = 0;
+
+std::vector<uint8_t> work_buffer;
+
 while (true) {
 int32_t n_dims;
 int32_t length;
@@ -2923,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 }
 
 struct ggml_cgraph gf = ggml_build_forward(r);
-
-
+
+ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
 // we won't need these tensors again, reset the context to save memory
 ggml_free(lora_ctx);
@@ -3077,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
 ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
 ggml_cgraph gf{};
-gf.n_threads = 1;
 
 ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
 kout3d->data = out;
@@ -3097,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-
+ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
 ggml_free(cpy_ctx);
 }
@@ -3183,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
 ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
 ggml_cgraph gf{};
-gf.n_threads = 1;
 
 ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
 kin3d->data = (void *) inp;
@@ -3203,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-
+ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
 ggml_free(cpy_ctx);
 }
@@ -3219,7 +3390,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 return nread;
 }
 
-bool
+static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
 llama_file file(path_session, "rb");
 
 // sanity checks
@@ -3273,6 +3444,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
 return true;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+try {
+return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+} catch (const std::exception & err) {
+fprintf(stderr, "error loading session file: %s\n", err.what());
+return false;
+}
+}
+
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
 llama_file file(path_session, "wb");
 
@@ -3428,23 +3608,35 @@ llama_token llama_token_nl() {
 return 13;
 }
 
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+struct llama_timings result = {
+/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+/*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+/*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+/*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+/*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
-
-
+/*.n_sample =*/ std::max(1, ctx->n_sample),
+/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+/*.n_eval =*/ std::max(1, ctx->n_eval),
+};
 
-
-
-
+return result;
+}
+
+void llama_print_timings(struct llama_context * ctx) {
+const llama_timings timings = llama_get_timings(ctx);
 
 fprintf(stderr, "\n");
-fprintf(stderr, "%s: load time = %8.2f ms\n", __func__,
+fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
 fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-__func__,
+__func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
 fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-__func__,
+__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
 fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-__func__,
-fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (
+__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
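The new llama_get_timings accessor exposes the same numbers that llama_print_timings formats, so callers can report them their own way. A short illustrative caller (assumes an already initialized llama_context * ctx; not taken from this diff):

const struct llama_timings t = llama_get_timings(ctx);
fprintf(stderr, "eval: %8.2f ms over %d runs (%8.2f ms per token)\n",
        t.t_eval_ms, t.n_eval, t.t_eval_ms / t.n_eval);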