llama_cpp 0.3.2 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,9 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_MPI
+#include "ggml-mpi.h"
+#endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
 #ifdef GGML_QKK_64
@@ -79,14 +82,34 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B,
-        { MODEL_30B,
-        { MODEL_65B,
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
     };
     return k_sizes;
 }
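Note: in this ggml revision graph execution is split into ggml_graph_plan() plus ggml_graph_compute(), and the old per-graph n_threads field is gone; the new ggml_graph_compute_helper() above hides the plan/work-buffer handling. A minimal sketch of the same pattern outside the helper, assuming a graph gf built with ggml_build_forward_expand and an n_threads value are already in scope:

    // sketch: drive a ggml graph with the plan-based API used throughout this release
    std::vector<uint8_t> work_buffer;                          // reusable scratch, kept by the caller
    struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);  // figure out how much work memory is needed
    if (plan.work_size > 0) {
        work_buffer.resize(plan.work_size);
        plan.work_data = work_buffer.data();
    }
    ggml_graph_compute(&gf, &plan);                            // replaces the removed gf.n_threads field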
@@ -118,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
@@ -167,6 +190,10 @@ struct llama_hparams {
     uint32_t n_head = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot = 64;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -281,7 +308,7 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
@@ -302,7 +329,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
     const llama_model & model;
-    const llama_vocab & vocab;
 
     bool model_owner = false;
 
@@ -321,6 +347,9 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
    std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -330,6 +359,10 @@ struct llama_context {
     ggml_metal_context * ctx_metal = NULL;
 #endif
 
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -522,7 +555,9 @@ struct llama_file_loader {
         }
 
         // skip to the next multiple of 32 bytes
-        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        }
 
         tensor.file_off = file.tell();
         tensor.name = name;
@@ -619,7 +654,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
 
@@ -758,7 +793,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -815,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
-        /*.tensor_split =*/
+        /*.tensor_split =*/ nullptr,
+        /*.rope_freq_base =*/ 10000.0f,
+        /*.rope_freq_scale =*/ 1.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
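The two new default fields let callers override the RoPE frequency base and linear scale through llama_context_params instead of recompiling. A hedged usage sketch (the model path is a placeholder, and the 0.5f scale is only an illustration of linear-scaling context extension):

    llama_context_params params = llama_context_default_params();
    params.n_ctx           = 4096;      // ask for a longer context window
    params.rope_freq_base  = 10000.0f;  // default base
    params.rope_freq_scale = 0.5f;      // compress positions roughly 2x (linear RoPE scaling)

    llama_model   * model = llama_load_model_from_file("/path/to/model.bin", params);
    llama_context * ctx   = llama_new_context_with_model(model, params);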
@@ -841,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -849,7 +889,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend(bool numa) {
+void llama_backend_init(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -862,6 +902,16 @@ void llama_init_backend(bool numa) {
     if (numa) {
         ggml_numa_init();
     }
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_init();
+#endif
+}
+
+void llama_backend_free() {
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_free();
+#endif
 }
 
 int64_t llama_time_us() {
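llama_init_backend is renamed to llama_backend_init, and a matching llama_backend_free is added so MPI (and any other process-wide backend state) can be torn down. Roughly, a host program now brackets its work like this (sketch):

    llama_backend_init(/*numa =*/ false);   // once per process, before loading any model

    // ... load model, create context, evaluate, sample ...

    llama_backend_free();                   // once per process, on shutdown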
@@ -929,6 +979,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -963,22 +1015,27 @@ static void llama_model_load_internal(
     }
 
         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base  = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
-        fprintf(stderr, "%s: format = %s\n",
-        fprintf(stderr, "%s: n_vocab = %u\n",
-        fprintf(stderr, "%s: n_ctx = %u\n",
-        fprintf(stderr, "%s: n_embd = %u\n",
-        fprintf(stderr, "%s: n_mult = %u\n",
-        fprintf(stderr, "%s: n_head = %u\n",
-        fprintf(stderr, "%s: n_layer = %u\n",
-        fprintf(stderr, "%s: n_rot = %u\n",
+        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff = %u\n",
-        fprintf(stderr, "%s: model size = %s\n",
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1127,9 +1184,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1232,7 +1289,9 @@ static bool llama_model_load(
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1241,7 +1300,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1263,18 +1322,16 @@ static bool llama_eval_internal(
         llama_context & lctx,
         const llama_token * tokens,
         const float * embd,
-
-
-
+        int n_tokens,
+        int n_past,
+        int n_threads,
         const char * cgraph_fname) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-
-
-
-        return false;
-    }
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -1295,6 +1352,9 @@ static bool llama_eval_internal(
     const int n_rot = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
 
@@ -1306,20 +1366,26 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
     if (tokens) {
-        struct ggml_tensor *
-
-
-
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
     }
@@ -1337,18 +1403,20 @@ static bool llama_eval_internal(
     offload_func_t offload_func_v = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
-
-
-
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
 #endif // GGML_USE_CUBLAS
 
     for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
         offload_func_t offload_func = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
@@ -1384,11 +1452,11 @@ static bool llama_eval_internal(
         offload_func_kq(tmpq);
         ggml_set_name(tmpq, "tmpq");
 
-        struct ggml_tensor * Kcur =
+        struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
         offload_func_kq(Kcur);
         ggml_set_name(Kcur, "Kcur");
 
-        struct ggml_tensor * Qcur =
+        struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
        offload_func_kq(Qcur);
         ggml_set_name(Qcur, "Qcur");
 
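For reference, and as I read the bundled ggml implementation, ggml_rope_custom_inplace rotates dimension pair i of a token at position p by an angle theta(p, i) = freq_scale * p * freq_base^(-2i/n_rot). With the defaults (freq_base = 10000, freq_scale = 1) this reduces to the plain ggml_rope behaviour used before; a smaller freq_scale compresses positions (linear context extension), while a larger freq_base stretches the frequency spectrum (NTK-style scaling).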
@@ -1555,7 +1623,6 @@ static bool llama_eval_internal(
 
         // input for next layer
         inpL = cur;
-
     }
 
     lctx.use_buf(ctx0, 0);
@@ -1563,7 +1630,6 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
@@ -1578,7 +1644,6 @@ static bool llama_eval_internal(
         embeddings = cur;
     }
 
-
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
@@ -1591,8 +1656,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
@@ -1612,12 +1682,21 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+#endif
+
+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
 #endif
 
+    // update kv token count
+    lctx.kv_self.n = n_past + N;
+
+    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
     }
@@ -1633,23 +1712,17 @@ static bool llama_eval_internal(
     // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}
 
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
     // extract logits
     {
         auto & logits_out = lctx.logits;
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -1957,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Normalize the second derivatives
-
-
-
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
 
     float cum_sum = 0.0f;
@@ -2118,6 +2200,52 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }
 
+static void llama_log_softmax(float * array, size_t size) {
+    float max_l = *std::max_element(array, array + size);
+    float sum = 0.f;
+    for (size_t i = 0; i < size; ++i) {
+        float p = expf(array[i] - max_l);
+        sum += p;
+        array[i] = p;
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        array[i] = logf(array[i] / sum);
+    }
+}
+
+void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale) {
+    int64_t t_start_sample_us = ggml_time_us();
+
+    assert(ctx);
+    auto n_vocab = llama_n_vocab(ctx);
+    assert(n_vocab == (int)candidates->size);
+    assert(!candidates->sorted);
+
+    std::vector<float> logits_base;
+    logits_base.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        logits_base.push_back(candidates->data[i].logit);
+    }
+    llama_log_softmax(logits_base.data(), candidates->size);
+
+    float* logits_guidance = llama_get_logits(guidance_ctx);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_guidance = logits_guidance[i];
+        float logit_base = logits_base[i];
+        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     assert(ctx);
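The new sampler blends the logits of the main context with those of a guidance_ctx that was evaluated on a negative or unconditioned prompt. A rough usage sketch, assuming both contexts have already been evaluated on their respective prompts and that llama.h and <vector> are included (the 1.5f scale is only an example):

    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back({ id, logits[id], 0.0f });   // id, logit, p
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // scale > 1.0f pushes the result away from what the guidance prompt predicts
    llama_sample_classifier_free_guidance(ctx, &candidates_p, guidance_ctx, 1.5f);
    llama_token id = llama_sample_token_greedy(ctx, &candidates_p);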
@@ -2405,15 +2533,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     } else {
         new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+        bool convert_incompatible_tensor = false;
         if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
             quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
             int nx = tensor.ne.at(0);
             int ny = tensor.ne.at(1);
             if (nx % QK_K != 0 || ny % QK_K != 0) {
-                fprintf(stderr, "\n\
-
-                fprintf(stderr, "========================================================================================\n\n");
-                throw std::runtime_error("Unsupported tensor size encountered\n");
+                fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                convert_incompatible_tensor = true;
             }
         }
         if (tensor.name == "output.weight") {
@@ -2441,6 +2568,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         }
+        if (convert_incompatible_tensor) {
+            if (tensor.name == "output.weight") {
+                new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+                fprintf(stderr, "F16 will be used for this tensor instead.\n");
+            } else if (tensor.name == "tok_embeddings.weight") {
+                new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+                fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+            } else {
+                throw std::runtime_error("Unsupported tensor size encountered\n");
+            }
+        }
 #endif
 
         float * f32_data;
@@ -2560,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.
-                params.
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2575,14 +2714,14 @@ void llama_free_model(struct llama_model * model) {
 }
 
 struct llama_context * llama_new_context_with_model(
-
-
+        struct llama_model * model,
+        struct llama_context_params params) {
 
     if (!model) {
         return nullptr;
     }
 
-    llama_context * ctx = new llama_context(*model
+    llama_context * ctx = new llama_context(*model);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -2636,16 +2775,16 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }
 
-    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+    ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
 
-    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr = NULL;
         size_t data_size = 0;
@@ -2680,6 +2819,18 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
+#ifdef GGML_USE_MPI
+    ctx->ctx_mpi = ggml_mpi_init();
+
+    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+        const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        llama_backend_free();
+        exit(1);
+    }
+#endif
+
     return ctx;
 }
 
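Note the control flow this introduces for MPI builds: every rank runs the same loading code, but only rank 0 ever returns from llama_new_context_with_model; the other ranks stay inside the blocking llama_eval loop above, serving their pipeline stage, and exit once rank 0 shuts the backend down. A sketch of what the host program sees (same code on every rank):

    llama_backend_init(false);                                  // also initializes the MPI backend in MPI builds
    llama_model   * model = llama_load_model_from_file(path, params);
    llama_context * ctx   = llama_new_context_with_model(model, params);
    // ranks > 0 never reach this point; rank 0 continues with the usual eval/sample loop
    llama_backend_free();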
@@ -2802,6 +2953,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2966,8 +3120,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-
-
+
+        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -3120,7 +3274,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3140,7 +3293,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -3226,7 +3379,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -3246,7 +3398,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -3407,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
 
-int llama_tokenize(
-
+int llama_tokenize_with_model(
+        const struct llama_model * model,
         const char * text,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize(
+    auto res = llama_tokenize(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3427,8 +3579,29 @@ int llama_tokenize(
     return res.size();
 }
 
+int llama_tokenize(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3439,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
-int llama_get_vocab(
-        const struct llama_context * ctx,
+int llama_get_vocab_from_model(
+        const struct llama_model * model,
         const char * * strings,
         float * scores,
         int capacity) {
-    int n = std::min(capacity, (int)
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i<n; ++i) {
-        strings[i] =
-        scores[i] =
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = model->vocab.id_to_token[i].score;
     }
     return n;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
@@ -3460,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    if (token >=
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
        return nullptr;
     }
 
-    return
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 
 llama_token llama_token_bos() {