llama_cpp 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,9 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_MPI
+#include "ggml-mpi.h"
+#endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
 #ifdef GGML_QKK_64
@@ -79,14 +82,34 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }

-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B,
-        { MODEL_30B,
-        { MODEL_65B,
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
     };
     return k_sizes;
 }
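The `ggml_graph_compute_helper` added here reflects an upstream ggml API change: `ggml_cgraph` no longer carries an `n_threads` field, and callers now build a `ggml_cplan` and supply its work buffer before running the graph. A minimal standalone sketch of the same pattern (assuming the ggml.h bundled with this release; not code from the gem):

```cpp
#include <cstdint>
#include <vector>

#include "ggml.h"

// Size the work buffer from the plan, then run the graph -- the same dance
// ggml_graph_compute_helper() wraps for every compute site in this diff.
void compute_graph(struct ggml_cgraph * graph, int n_threads) {
    static std::vector<uint8_t> work; // reusable scratch, like lctx.work_buffer

    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
    if (plan.work_size > 0) {
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }
    ggml_graph_compute(graph, &plan);
}
```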
@@ -118,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,   ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 256ull +  768ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
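Both scratch and eval buffers now grow with the requested context length instead of being flat per-model constants. As a quick sanity check of the formulas above (a sketch, not gem code), a 7B model at n_ctx = 2048 reserves (2048/16 + 256) MB = 384 MB of scratch and (2048/256 + 768) MB = 776 MB of eval buffer:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const size_t MB    = 1024 * 1024; // same MB unit llama.cpp uses
    const size_t n_ctx = 2048;

    const size_t scratch0_7b = (n_ctx / 16  + 256) * MB; // MEM_REQ_SCRATCH0, MODEL_7B
    const size_t eval_7b     = (n_ctx / 256 + 768) * MB; // MEM_REQ_EVAL,     MODEL_7B

    printf("scratch0 7B @ %zu ctx: %zu MB\n", n_ctx, scratch0_7b / MB); // 384 MB
    printf("eval     7B @ %zu ctx: %zu MB\n", n_ctx, eval_7b / MB);     // 776 MB
    return 0;
}
```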
@@ -167,6 +190,10 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
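These two new hyperparameters feed the ggml_rope_custom_inplace calls further down in this diff. Under the usual RoPE convention (an assumed formula for illustration, not gem code), each position p and dimension pair i is rotated by theta = freq_scale * p * freq_base^(-2i/n_rot), so raising freq_base or lowering freq_scale stretches the positional encoding and lets a model trained at 2048 tokens run at longer contexts:

```cpp
#include <cmath>
#include <cstdio>

// Per-dimension RoPE angle implied by rope_freq_base / rope_freq_scale.
static float rope_theta(int pos, int dim_pair, int n_rot,
                        float freq_base, float freq_scale) {
    const float inv_freq = powf(freq_base, -2.0f * dim_pair / n_rot);
    return freq_scale * pos * inv_freq;
}

int main() {
    // Linear scaling: with freq_scale = 0.5, position 4096 gets the same
    // angle that position 2048 had with the default parameters.
    printf("%.1f\n", rope_theta(4096, 0, 64, 10000.0f, 0.5f)); // 2048.0
    printf("%.1f\n", rope_theta(2048, 0, 64, 10000.0f, 1.0f)); // 2048.0
    return 0;
}
```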
@@ -281,7 +308,7 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
@@ -302,7 +329,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

     const llama_model & model;
-    const llama_vocab & vocab;

     bool model_owner = false;

@@ -321,6 +347,9 @@
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;

+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -330,6 +359,10 @@
     ggml_metal_context * ctx_metal = NULL;
 #endif

+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -522,7 +555,9 @@ struct llama_file_loader {
         }

         // skip to the next multiple of 32 bytes
-        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        }

         tensor.file_off = file.tell();
         tensor.name = name;
@@ -619,7 +654,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }

@@ -758,7 +793,6 @@

 };

-
 //
 // kv cache
 //
@@ -815,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
-        /*.tensor_split =*/
+        /*.tensor_split =*/ nullptr,
+        /*.rope_freq_base =*/ 10000.0f,
+        /*.rope_freq_scale =*/ 1.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
@@ -841,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }

+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -849,7 +889,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

-void llama_init_backend(bool numa) {
+void llama_backend_init(bool numa) {
     ggml_time_init();

     // needed to initialize f16 tables
@@ -862,6 +902,16 @@ void llama_init_backend(bool numa) {
     if (numa) {
         ggml_numa_init();
     }
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_init();
+#endif
+}
+
+void llama_backend_free() {
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_free();
+#endif
 }

 int64_t llama_time_us() {
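`llama_init_backend` is renamed to `llama_backend_init`, and a matching `llama_backend_free` is introduced so the MPI backend can be shut down cleanly. A hedged sketch of the new lifecycle from the C API side (not gem code):

```cpp
#include "llama.h"

int main() {
    llama_backend_init(/*numa =*/ false); // previously llama_init_backend()

    // ... load a model, create contexts, evaluate, sample ...

    llama_backend_free(); // new: also calls ggml_mpi_backend_free() when built with MPI
    return 0;
}
```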
@@ -929,6 +979,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -963,22 +1015,27 @@ static void llama_model_load_internal(
     }

        hparams.n_ctx = n_ctx;
+
+       hparams.rope_freq_base = rope_freq_base;
+       hparams.rope_freq_scale = rope_freq_scale;
    }

    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

    {
-       fprintf(stderr, "%s: format = %s\n",
-       fprintf(stderr, "%s: n_vocab = %u\n",
-       fprintf(stderr, "%s: n_ctx = %u\n",
-       fprintf(stderr, "%s: n_embd = %u\n",
-       fprintf(stderr, "%s: n_mult = %u\n",
-       fprintf(stderr, "%s: n_head = %u\n",
-       fprintf(stderr, "%s: n_layer = %u\n",
-       fprintf(stderr, "%s: n_rot = %u\n",
+       fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+       fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+       fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+       fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+       fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+       fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+       fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+       fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+       fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+       fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
        fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-       fprintf(stderr, "%s: n_ff = %u\n",
-       fprintf(stderr, "%s: model size = %s\n",
+       fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+       fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
    }

    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1127,9 +1184,9 @@
        const size_t mem_required =
            ctx_size +
            mmapped_size - vram_weights + // weights in VRAM not in memory
-           MEM_REQ_SCRATCH0().at(model.type) +
+           MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
            MEM_REQ_SCRATCH1().at(model.type) +
-           MEM_REQ_EVAL().at(model.type);
+           MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
@@ -1232,7 +1289,9 @@ static bool llama_model_load(
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1241,7 +1300,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1263,18 +1322,16 @@ static bool llama_eval_internal(
         llama_context & lctx,
     const llama_token * tokens,
     const float * embd,
-
-
-
+    int n_tokens,
+    int n_past,
+    int n_threads,
     const char * cgraph_fname) {

     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

-
-
-
-        return false;
-    }
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif

     const int64_t t_start_us = ggml_time_us();

@@ -1295,6 +1352,9 @@
     const int n_rot = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;

+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;

@@ -1306,20 +1366,26 @@

     struct ggml_context * ctx0 = ggml_init(params);

+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

     if (tokens) {
-        struct ggml_tensor *
-
-
-
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
     }
@@ -1337,18 +1403,20 @@
     offload_func_t offload_func_v = llama_nop;

 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
-
-
-
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
 #endif // GGML_USE_CUBLAS

     for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
         offload_func_t offload_func = llama_nop;

 #ifdef GGML_USE_CUBLAS
@@ -1384,11 +1452,11 @@
         offload_func_kq(tmpq);
         ggml_set_name(tmpq, "tmpq");

-        struct ggml_tensor * Kcur =
+        struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
         offload_func_kq(Kcur);
         ggml_set_name(Kcur, "Kcur");

-        struct ggml_tensor * Qcur =
+        struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
         offload_func_kq(Qcur);
         ggml_set_name(Qcur, "Qcur");

|
|
1555
1623
|
|
1556
1624
|
// input for next layer
|
1557
1625
|
inpL = cur;
|
1558
|
-
|
1559
1626
|
}
|
1560
1627
|
|
1561
1628
|
lctx.use_buf(ctx0, 0);
|
@@ -1563,7 +1630,6 @@ static bool llama_eval_internal(
|
|
1563
1630
|
// used at the end to optionally extract the embeddings
|
1564
1631
|
struct ggml_tensor * embeddings = NULL;
|
1565
1632
|
|
1566
|
-
|
1567
1633
|
// norm
|
1568
1634
|
{
|
1569
1635
|
cur = ggml_rms_norm(ctx0, inpL);
|
@@ -1578,7 +1644,6 @@ static bool llama_eval_internal(
|
|
1578
1644
|
embeddings = cur;
|
1579
1645
|
}
|
1580
1646
|
|
1581
|
-
|
1582
1647
|
// lm_head
|
1583
1648
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
1584
1649
|
ggml_set_name(cur, "result_output");
|
@@ -1591,8 +1656,13 @@ static bool llama_eval_internal(
|
|
1591
1656
|
// run the computation
|
1592
1657
|
ggml_build_forward_expand(&gf, cur);
|
1593
1658
|
|
1659
|
+
#if GGML_USE_MPI
|
1660
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
|
1661
|
+
#endif
|
1662
|
+
|
1594
1663
|
#ifdef GGML_USE_METAL
|
1595
1664
|
if (lctx.ctx_metal && N == 1) {
|
1665
|
+
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
1596
1666
|
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
1597
1667
|
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
1598
1668
|
} else {
|
@@ -1612,12 +1682,21 @@
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }

-
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+#endif
+
+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
 #endif

+    // update kv token count
+    lctx.kv_self.n = n_past + N;
+
+    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
     }
@@ -1633,23 +1712,17 @@
     // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}

-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
     // extract logits
     {
         auto & logits_out = lctx.logits;

         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }

@@ -1957,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }

     // Normalize the second derivatives
-
-
-
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }

     float cum_sum = 0.0f;
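The tail-free sampler previously divided by the sum of second derivatives unconditionally; the rewritten block falls back to uniform weights when that sum is effectively zero, so degenerate candidate lists no longer produce NaN probabilities. A minimal sketch of the same guard in isolation (not gem code):

```cpp
#include <numeric>
#include <vector>

// Normalize in place, or fall back to a uniform distribution when the
// total mass is too small to divide by safely.
void normalize_or_uniform(std::vector<float> & v) {
    const float sum = std::accumulate(v.begin(), v.end(), 0.0f);
    if (sum > 1e-6f) {
        for (float & x : v) x /= sum;
    } else {
        for (float & x : v) x = 1.0f / v.size();
    }
}
```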
@@ -2118,6 +2200,52 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }

+static void llama_log_softmax(float * array, size_t size) {
+    float max_l = *std::max_element(array, array + size);
+    float sum = 0.f;
+    for (size_t i = 0; i < size; ++i) {
+        float p = expf(array[i] - max_l);
+        sum += p;
+        array[i] = p;
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        array[i] = logf(array[i] / sum);
+    }
+}
+
+void llama_sample_classifier_free_guidance(
+          struct llama_context * ctx,
+        llama_token_data_array * candidates,
+          struct llama_context * guidance_ctx,
+                         float   scale) {
+    int64_t t_start_sample_us = ggml_time_us();
+
+    assert(ctx);
+    auto n_vocab = llama_n_vocab(ctx);
+    assert(n_vocab == (int)candidates->size);
+    assert(!candidates->sorted);
+
+    std::vector<float> logits_base;
+    logits_base.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        logits_base.push_back(candidates->data[i].logit);
+    }
+    llama_log_softmax(logits_base.data(), candidates->size);
+
+    float* logits_guidance = llama_get_logits(guidance_ctx);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_guidance = logits_guidance[i];
+        float logit_base = logits_base[i];
+        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}

 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     assert(ctx);
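The new classifier-free guidance sampler blends the log-softmax of the main context's logits with those of a second "guidance" context (typically evaluated on a negative prompt): logit = scale * (base - guidance) + guidance, so scale = 1 reproduces the base distribution and larger values push away from the guidance context. A hedged usage sketch built only from functions visible in this diff and the existing sampling API (not gem code):

```cpp
#include <vector>

#include "llama.h"

// Both contexts are assumed to have been advanced to the same position
// with llama_eval() before sampling.
llama_token sample_with_cfg(llama_context * ctx, llama_context * guidance_ctx, float scale) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);

    std::vector<llama_token_data> cand;
    cand.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array cand_p = { cand.data(), cand.size(), /*sorted =*/ false };

    llama_sample_classifier_free_guidance(ctx, &cand_p, guidance_ctx, scale);

    return llama_sample_token_greedy(ctx, &cand_p);
}
```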
@@ -2405,15 +2533,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            bool convert_incompatible_tensor = false;
             if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\
-
-                    fprintf(stderr, "========================================================================================\n\n");
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
                 }
             }
             if (tensor.name == "output.weight") {
@@ -2441,6 +2568,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            if (convert_incompatible_tensor) {
+                if (tensor.name == "output.weight") {
+                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+                    fprintf(stderr, "F16 will be used for this tensor instead.\n");
+                } else if (tensor.name == "tok_embeddings.weight") {
+                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+                    fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+                } else {
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
 #endif

             float * f32_data;
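k-quants work on super-blocks of QK_K values (256 by default, 64 when ggml is built with GGML_QKK_64), so tensors whose dimensions are not multiples of QK_K cannot be k-quantized; instead of aborting the whole quantization, the code now falls back to F16 for output.weight and Q4_0 for tok_embeddings.weight. A small sketch of the divisibility test that decides this (not gem code; the 3200-wide tensor is an illustrative case such as a 3B model's embedding width):

```cpp
#include <cstdio>

constexpr int QK_K = 256; // 64 when built with GGML_QKK_64

bool k_quant_compatible(int nx, int ny) {
    return nx % QK_K == 0 && ny % QK_K == 0;
}

int main() {
    printf("%d\n", k_quant_compatible(4096, 4096)); // 1: typical 7B tensor shapes are fine
    printf("%d\n", k_quant_compatible(3200, 3200)); // 0: triggers the new fallback path
    return 0;
}
```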
@@ -2560,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.
-                params.
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2575,14 +2714,14 @@ void llama_free_model(struct llama_model * model) {
 }

 struct llama_context * llama_new_context_with_model(
-
-
+        struct llama_model * model,
+        struct llama_context_params params) {

     if (!model) {
         return nullptr;
     }

-    llama_context * ctx = new llama_context(*model
+    llama_context * ctx = new llama_context(*model);

     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -2636,16 +2775,16 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }

-    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+    ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

-    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);

         void * data_ptr = NULL;
         size_t data_size = 0;
@@ -2680,6 +2819,18 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif

+#ifdef GGML_USE_MPI
+    ctx->ctx_mpi = ggml_mpi_init();
+
+    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+        const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        llama_backend_free();
+        exit(1);
+    }
+#endif
+
     return ctx;
 }

@@ -2802,6 +2953,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2966,8 +3120,8 @@
         }

         struct ggml_cgraph gf = ggml_build_forward(r);
-
-
+
+        ggml_graph_compute_helper(work_buffer, &gf, n_threads);

         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -3120,7 +3274,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3140,7 +3293,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

         ggml_free(cpy_ctx);
     }
@@ -3226,7 +3379,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -3246,7 +3398,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

         ggml_free(cpy_ctx);
     }
@@ -3407,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }

-int llama_tokenize(
-        struct llama_context * ctx,
+int llama_tokenize_with_model(
+        const struct llama_model * model,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos) {
-    auto res = llama_tokenize(
+    auto res = llama_tokenize(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3427,8 +3579,29 @@ int llama_tokenize(
     return res.size();
 }

+int llama_tokenize(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }

 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3439,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }

-int llama_get_vocab(
-        const struct llama_context * ctx,
+int llama_get_vocab_from_model(
+        const struct llama_model * model,
         const char * * strings,
         float * scores,
         int capacity) {
-    int n = std::min(capacity, (int)
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i<n; ++i) {
-        strings[i] =
-        scores[i] =
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i] = model->vocab.id_to_token[i].score;
     }
     return n;
 }

+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
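The vocabulary now lives on llama_model, and the context-level accessors gain *_from_model counterparts, so a model can be inspected without ever creating a llama_context. A hedged sketch using only functions visible in this diff ("model.bin" is a placeholder path; not gem code):

```cpp
#include <cstdio>

#include "llama.h"

int main() {
    llama_backend_init(false);

    llama_model * model = llama_load_model_from_file("model.bin", llama_context_default_params());
    if (!model) {
        return 1;
    }

    printf("n_vocab = %d\n", llama_n_vocab_from_model(model));
    printf("n_ctx   = %d\n", llama_n_ctx_from_model(model));
    printf("n_embd  = %d\n", llama_n_embd_from_model(model));
    printf("bos str = %s\n", llama_token_to_str_with_model(model, llama_token_bos()));

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```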
@@ -3460,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-const char *
-    if (token >=
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }

-    return
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }

 llama_token llama_token_bos() {