llama_cpp 0.3.1 → 0.3.3
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +11 -2
- data/ext/llama_cpp/llama_cpp.cpp +284 -111
- data/ext/llama_cpp/src/ggml-cuda.cu +639 -148
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +19 -6
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +1734 -2248
- data/ext/llama_cpp/src/ggml.h +152 -80
- data/ext/llama_cpp/src/llama.cpp +282 -90
- data/ext/llama_cpp/src/llama.h +30 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -13
- data/sig/llama_cpp.rbs +22 -2
- metadata +5 -2
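Most of the llama.cpp churn in this release comes from ggml's switch to plan-based graph execution: the old `gf.n_threads` field is gone and callers go through `ggml_graph_plan` / `ggml_graph_compute` (wrapped by the new `ggml_graph_compute_helper` added in the diff below). The following is a minimal, hypothetical sketch of that calling pattern against the ggml headers bundled under `ext/llama_cpp/src`; the tensor shapes, values, and thread count are made up for illustration and are not taken from the package.

```cpp
// Sketch only: plan-based graph compute as used by this release's llama.cpp.
#include <cstdio>
#include <vector>
#include "ggml.h"

int main() {
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // a tiny graph: y = w * x (shapes chosen arbitrarily for the example)
    struct ggml_tensor * w = ggml_set_f32(ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2), 1.0f);
    struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4), 1.0f);
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);

    struct ggml_cgraph gf = ggml_build_forward(y);

    // old API: gf.n_threads = 4; ggml_graph_compute(ctx, &gf);
    // new API: ask for a plan, hand it a work buffer, then compute
    std::vector<uint8_t> work;
    struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads*/ 4);
    if (plan.work_size > 0) {
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }
    ggml_graph_compute(&gf, &plan);

    printf("computed %d nodes\n", gf.n_nodes);
    ggml_free(ctx);
    return 0;
}
```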
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,9 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_MPI
+#include "ggml-mpi.h"
+#endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
 #ifdef GGML_QKK_64
@@ -66,6 +69,7 @@ enum e_model {
     MODEL_65B,
 };

+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -78,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }

+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -129,6 +152,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }

+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * kB },
+        { MODEL_7B, 512ull * kB },
+        { MODEL_13B, 640ull * kB },
+        { MODEL_30B, 768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull },
+        { MODEL_7B, 128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -165,8 +216,8 @@ struct llama_layer {
 };

 struct llama_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;

     struct ggml_context * ctx = NULL;

@@ -253,7 +304,13 @@ struct llama_model {

 struct llama_context {
     llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+#ifdef GGML_USE_METAL
+    ~llama_context() {
+        if (ctx_metal) {
+            ggml_metal_free(ctx_metal);
+        }
+    }
+#endif
     std::mt19937 rng;

     bool has_evaluated_once = false;
@@ -286,6 +343,9 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;

+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -295,6 +355,10 @@ struct llama_context {
     ggml_metal_context * ctx_metal = NULL;
 #endif

+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -446,9 +510,7 @@ struct llama_file_loader {
             std::string word = file.read_string(len);

             float score = 0.0f;
-
-            file.read_raw(&score, sizeof(score));
-            }
+            file.read_raw(&score, sizeof(score));

             vocab.token_to_id[word] = i;

@@ -725,7 +787,6 @@ struct llama_model_loader {

 };

-
 //
 // kv cache
 //
@@ -816,7 +877,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

-void llama_init_backend(bool numa) {
+void llama_backend_init(bool numa) {
     ggml_time_init();

     // needed to initialize f16 tables
@@ -829,6 +890,16 @@ void llama_init_backend(bool numa) {
     if (numa) {
         ggml_numa_init();
     }
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_init();
+#endif
+}
+
+void llama_backend_free() {
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_free();
+#endif
 }

 int64_t llama_time_us() {
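The hunk above renames `llama_init_backend` to `llama_backend_init` and adds a matching `llama_backend_free`. Below is a hedged sketch of how a host program might bracket its lifetime with the new pair; the model path is a placeholder and the default parameters are used only for illustration.

```cpp
// Usage sketch (not part of the gem) for the renamed backend API in this release's llama.h.
#include "llama.h"

int main() {
    llama_backend_init(/*numa*/ false);   // previously llama_init_backend(bool numa)

    llama_context_params params = llama_context_default_params();
    llama_model * model = llama_load_model_from_file("model.bin", params);  // placeholder path
    if (model != NULL) {
        llama_context * lctx = llama_new_context_with_model(model, params);
        if (lctx != NULL) {
            llama_free(lctx);
        }
        llama_free_model(model);
    }

    llama_backend_free();                 // new: tears down backend state (MPI when built with GGML_USE_MPI)
    return 0;
}
```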
@@ -1112,14 +1183,18 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
     } else {
-
+        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x
-                    __func__,
+            fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                    (vram_scratch + MB - 1) / MB); // round up
         }
     }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

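For reference, the new scratch-size formula above works out as follows for the `MODEL_7B` row of the tables added earlier; `n_batch = 512` and `n_ctx = 2048` are example values chosen for this illustration, not defaults taken from the diff.

```cpp
// Illustrative arithmetic only; mirrors vram_scratch = n_batch * (base + n_ctx * per_context).
#include <cstdio>

int main() {
    const size_t kB = 1024, MB = 1024 * 1024;
    const size_t n_batch = 512, n_ctx = 2048;         // example values
    const size_t base = 512 * kB, per_context = 128;  // MODEL_7B entries from the diff

    const size_t vram_scratch = n_batch * (base + n_ctx * per_context);
    printf("%zu MB\n", (vram_scratch + MB - 1) / MB);  // prints 384
    return 0;
}
```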
@@ -1128,6 +1203,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+    const int max_backend_supported_layers = hparams.n_layer + 3;
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1223,18 @@ static void llama_model_load_internal(
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
-
+#elif defined(GGML_USE_CLBLAST)
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers),
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 }

 // populate `tensors_by_name`
@@ -1218,18 +1301,16 @@ static bool llama_eval_internal(
         llama_context & lctx,
         const llama_token * tokens,
         const float * embd,
-
-
-
+        int n_tokens,
+        int n_past,
+        int n_threads,
         const char * cgraph_fname) {

     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

-
-
-
-        return false;
-    }
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif

     const int64_t t_start_us = ggml_time_us();

@@ -1261,20 +1342,26 @@ static bool llama_eval_internal(

     struct ggml_context * ctx0 = ggml_init(params);

+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

     if (tokens) {
-        struct ggml_tensor *
-
-
-
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
     }
@@ -1292,18 +1379,20 @@ static bool llama_eval_internal(
     offload_func_t offload_func_v = llama_nop;

 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
-
-
-
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
 #endif // GGML_USE_CUBLAS

     for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
         offload_func_t offload_func = llama_nop;

 #ifdef GGML_USE_CUBLAS
@@ -1510,7 +1599,6 @@ static bool llama_eval_internal(

         // input for next layer
         inpL = cur;
-
     }

     lctx.use_buf(ctx0, 0);
@@ -1518,7 +1606,6 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;

-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
@@ -1533,7 +1620,6 @@ static bool llama_eval_internal(
         embeddings = cur;
     }

-
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
@@ -1546,8 +1632,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);

+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
@@ -1567,12 +1658,21 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }

-
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif

+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
+    // update kv token count
+    lctx.kv_self.n = n_past + N;
+
+    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
     }
@@ -1588,23 +1688,17 @@ static bool llama_eval_internal(
     // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}

-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
     // extract logits
     {
         auto & logits_out = lctx.logits;

         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }

@@ -1860,10 +1954,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);

+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -1892,9 +1986,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();

     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +2039,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);

+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2074,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }

+static void llama_log_softmax(float * array, size_t size) {
+    float max_l = *std::max_element(array, array + size);
+    float sum = 0.f;
+    for (size_t i = 0; i < size; ++i) {
+        float p = expf(array[i] - max_l);
+        sum += p;
+        array[i] = p;
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        array[i] = logf(array[i] / sum);
+    }
+}
+
+void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale,
+        float smooth_factor) {
+    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+    assert(ctx);
+    auto n_vocab = llama_n_vocab(ctx);
+    assert(n_vocab == (int)candidates->size);
+    assert(!candidates->sorted);
+
+    std::vector<float> logits_base;
+    logits_base.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        logits_base.push_back(candidates->data[i].logit);
+    }
+    llama_log_softmax(logits_base.data(), candidates->size);
+
+    float* logits_guidance = llama_get_logits(guidance_ctx);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_guidance = logits_guidance[i];
+        float logit_base = logits_base[i];
+        logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+    }
+
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_base = logits_base[i];
+        float logit_guidance = logits_guidance[i];
+
+        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}

 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     assert(ctx);
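The classifier-free-guidance sampler added above mixes base and guidance log-probabilities in two steps: `scale * (logit_base - logit_guidance) + logit_guidance`, then a `smooth_factor` blend back toward the base logits. Here is a toy numeric check of just that arithmetic; the logit values, `scale`, and `smooth_factor` are invented, and the surrounding `llama_log_softmax` normalization is deliberately omitted.

```cpp
// Toy check of the two mixing steps used by llama_sample_classifier_free_guidance; values are made up.
#include <cstdio>

int main() {
    const float scale = 1.5f, smooth_factor = 0.2f;
    const float logit_base = -1.0f, logit_guidance = -3.0f;

    // step 1: push the guidance logit in the direction of the base logit
    const float guided = scale * (logit_base - logit_guidance) + logit_guidance;     // 1.5*2 - 3 = 0.0
    // step 2: blend the result with the original base logit
    const float mixed = smooth_factor * guided + (1.f - smooth_factor) * logit_base; // 0.2*0 + 0.8*(-1) = -0.8

    printf("guided = %.2f, mixed = %.2f\n", guided, mixed);
    return 0;
}
```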
@@ -2119,13 +2268,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }

 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();

@@ -2140,13 +2287,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }

+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);

     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();

@@ -2214,10 +2362,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;

-
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype =
-        if (qtype.
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2228,7 +2376,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     if (tensor.type == GGML_TYPE_F16) {
         ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
     } else if (ggml_is_quantized(tensor.type)) {
-        qtype.
+        qtype.to_float(tensor.data, f32_output, nelements);
     } else {
         LLAMA_ASSERT(false); // unreachable
     }
@@ -2253,7 +2401,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
         } else {
-            qtype.
+            qtype.to_float(inbuf, outbuf, nels);
         }
     };
     workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -2362,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            bool convert_incompatible_tensor = false;
             if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\
-
-                    fprintf(stderr, "========================================================================================\n\n");
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
                 }
             }
             if (tensor.name == "output.weight") {
@@ -2398,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            if (convert_incompatible_tensor) {
+                if (tensor.name == "output.weight") {
+                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+                    fprintf(stderr, "F16 will be used for this tensor instead.\n");
+                } else if (tensor.name == "tok_embeddings.weight") {
+                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+                    fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+                } else {
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
 #endif

             float * f32_data;
@@ -2532,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
 }

 struct llama_context * llama_new_context_with_model(
-
-
+        struct llama_model * model,
+        struct llama_context_params params) {

     if (!model) {
         return nullptr;
@@ -2602,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);

         void * data_ptr = NULL;
         size_t data_size = 0;
@@ -2637,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif

+#ifdef GGML_USE_MPI
+    ctx->ctx_mpi = ggml_mpi_init();
+
+    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+        const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        llama_backend_free();
+        exit(1);
+    }
+#endif
+
     return ctx;
 }

@@ -2759,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2923,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }

             struct ggml_cgraph gf = ggml_build_forward(r);
-
-
+
+            ggml_graph_compute_helper(work_buffer, &gf, n_threads);

             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
@@ -3077,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3097,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

         ggml_free(cpy_ctx);
     }
@@ -3183,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -3203,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

         ggml_free(cpy_ctx);
     }
@@ -3219,7 +3390,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     return nread;
 }

-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");

     // sanity checks
@@ -3273,6 +3444,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }

+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading session file: %s\n", err.what());
+        return false;
+    }
+}
+
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");

@@ -3428,23 +3608,35 @@ llama_token llama_token_nl() {
     return 13;
 }

+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,

-
-
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval =*/ std::max(1, ctx->n_eval),
+    };

-
-
-
+    return result;
+}
+
+void llama_print_timings(struct llama_context * ctx) {
+    const llama_timings timings = llama_get_timings(ctx);

     fprintf(stderr, "\n");
-    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__,
+    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
-    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {