llama_cpp 0.3.0 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -66,6 +66,7 @@ enum e_model {
     MODEL_65B,
 };

+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }

+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * kB },
+        { MODEL_7B,   512ull * kB },
+        { MODEL_13B,  640ull * kB },
+        { MODEL_30B,  768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull },
+        { MODEL_7B,  128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
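Note on the two tables just added: the scratch requirement scales linearly in both batch size and context length, vram_scratch = n_batch * (base + n_ctx * per_context), as used in llama_model_load_internal further down. A worked example (not part of the diff), assuming the MODEL_7B entries with n_batch = 512 and n_ctx = 2048:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t base    = 512ull * 1024; // VRAM_REQ_SCRATCH_BASE().at(MODEL_7B)
        const size_t per_ctx = 128ull;        // VRAM_REQ_SCRATCH_PER_CONTEXT().at(MODEL_7B)
        const size_t vram_scratch = 512 * (base + 2048 * per_ctx);
        // 512 * (524288 + 262144) = 402653184 bytes = 384 MB
        printf("%zu bytes = %zu MB\n", vram_scratch, vram_scratch / (1024 * 1024));
        return 0;
    }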
@@ -165,8 +194,8 @@ struct llama_layer {
 };

 struct llama_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;

     struct ggml_context * ctx = NULL;

@@ -253,7 +282,13 @@ struct llama_model {

 struct llama_context {
     llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+#ifdef GGML_USE_METAL
+    ~llama_context() {
+        if (ctx_metal) {
+            ggml_metal_free(ctx_metal);
+        }
+    }
+#endif
     std::mt19937 rng;

     bool has_evaluated_once = false;
@@ -364,96 +399,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }

-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };

 struct llama_load_tensors_map {
@@ -476,13 +429,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;

-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -528,9 +481,7 @@ struct llama_file_loader {
             std::string word = file.read_string(len);

             float score = 0.0f;
-            if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
-                file.read_raw(&score, sizeof(score));
-            }
+            file.read_raw(&score, sizeof(score));

             vocab.token_to_id[word] = i;

@@ -539,19 +490,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -566,30 +517,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
             }

-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);

-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);

-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
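The padding seek in read_tensor_metadata relies on a two's-complement identity: for any file position pos, (-pos) & 31 is exactly the byte count to the next multiple of 32. A self-contained sketch of that identity (the helper name is ours, not from the diff):

    #include <cassert>
    #include <cstddef>

    // mirrors file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
    static size_t pad_to_32(size_t pos) {
        return static_cast<size_t>(-static_cast<ptrdiff_t>(pos)) & 31;
    }

    int main() {
        assert(pad_to_32(0)  == 0);  // already aligned
        assert(pad_to_32(1)  == 31); // 1 + 31 = 32
        assert(pad_to_32(45) == 19); // 45 + 19 = 64
        return 0;
    }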
@@ -659,56 +600,19 @@ struct llama_file_saver {
 };

 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;

-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }

     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +678,7 @@ struct llama_model_loader {
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -830,45 +734,13 @@ struct llama_model_loader {

     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
@@ -938,7 +810,7 @@ static bool kv_cache_init(

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed       =*/ -1,
+        /*.seed       =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx      =*/ 512,
         /*.n_batch    =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -1067,12 +939,12 @@ static void llama_model_load_internal(

     model.t_start_us = ggml_time_us();

-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;

     {
@@ -1106,7 +978,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

@@ -1274,14 +1145,18 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
     } else {
-        vram_scratch = n_batch * MB;
+        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
-                    __func__, vram_scratch / MB);
+            fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                    (vram_scratch + MB - 1) / MB); // round up
         }
     }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

@@ -1290,6 +1165,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+    const int max_backend_supported_layers = hparams.n_layer + 3;
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1306,14 +1185,18 @@ static void llama_model_load_internal(
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
-    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     }

     // populate `tensors_by_name`
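The backend-specific constants above drive the "offloaded X/Y layers" report. An arithmetic sketch (values are hypothetical), assuming n_layer = 32 as in 7B on the CUBLAS path without low_vram:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const int n_layer      = 32;
        const int n_gpu_layers = 40; // user requested more layers than exist
        const int max_backend_supported_layers = n_layer + 3; // repeating + non-repeating layers
        const int max_offloadable_layers       = n_layer + 3; // would be n_layer + 1 with low_vram
        printf("offloaded %d/%d layers to GPU\n",
               std::min(n_gpu_layers, max_offloadable_layers),
               max_backend_supported_layers); // prints "offloaded 35/35 layers to GPU"
        return 0;
    }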
@@ -1369,22 +1252,26 @@ static bool llama_model_load(

 // evaluate the transformer
 //
-//   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - n_tokens   number of tokens
-//   - n_past:    the context size so far
-//   - n_threads: number of threads to use
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - embd       embeddings input
+//   - n_tokens   number of tokens
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-        llama_context & lctx,
-    const llama_token * tokens,
-            const int   n_tokens,
-            const int   n_past,
-            const int   n_threads,
+        llama_context & lctx,
+    const llama_token * tokens,
+          const float * embd,
+            const int   n_tokens,
+            const int   n_past,
+            const int   n_threads,
         const char * cgraph_fname) {

+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1424,12 +1311,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }

     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
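llama_eval_internal now takes exactly one of two inputs, enforced by the assert above: token ids (looked up in tok_embeddings) or precomputed embeddings (copied into an n_embd x N tensor). From the caller's side, via the public wrappers added later in this diff (a fragment, assuming an initialized llama_context * ctx; buffer contents are placeholders):

    // token path: ids go through the embedding lookup
    llama_token tokens[1] = { llama_token_bos() };
    llama_eval(ctx, tokens, 1, /*n_past*/ 0, /*n_threads*/ 4);

    // embedding path: one float row of size llama_n_embd(ctx) per position
    std::vector<float> embd(1 * llama_n_embd(ctx), 0.0f);
    llama_eval_embd(ctx, embd.data(), 1, /*n_past*/ 0, /*n_threads*/ 4);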
@@ -2012,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);

+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -2044,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();

     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -2098,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);

+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2271,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }

 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();

@@ -2292,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }

+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);

     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();

@@ -2366,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;

-    quantize_fns_t qtype;
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_quantize_fn(tensor.type);
-        if (qtype.dequantize_row_q == NULL) {
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2380,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     if (tensor.type == GGML_TYPE_F16) {
         ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
     } else if (ggml_is_quantized(tensor.type)) {
-        qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        qtype.to_float(tensor.data, f32_output, nelements);
     } else {
         LLAMA_ASSERT(false); // unreachable
     }
@@ -2405,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
         } else {
-            qtype.dequantize_row_q(inbuf, outbuf, nels);
+            qtype.to_float(inbuf, outbuf, nels);
         }
     };
     workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -2451,9 +2342,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2654,6 +2544,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }

+
+
 //
 // interface implementation
 //
@@ -2692,7 +2584,7 @@ struct llama_context * llama_new_context_with_model(

     llama_context * ctx = new llama_context(*model, model->vocab);

-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

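The seed changed from a signed int (where any negative value meant "pick a random seed") to uint32_t with a sentinel. Assuming LLAMA_DEFAULT_SEED is the 0xFFFFFFFF constant this release adds to llama.h, the new convention looks like this (fragment):

    llama_context_params params = llama_context_default_params();
    // params.seed == LLAMA_DEFAULT_SEED, so the context will seed from time(NULL)
    params.seed = 42; // any other value gives a reproducible RNG stream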
@@ -2874,7 +2766,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }

@@ -2885,7 +2777,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));

         size_t ctx_size;
         size_t mmapped_size;
@@ -2903,7 +2795,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }

@@ -2964,7 +2856,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
                 return false;
             }
         }
-        ggml_tensor* lora_tensor;
+        ggml_tensor * lora_tensor;
         if (n_dims == 2) {
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
@@ -2972,6 +2864,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
+        ggml_set_name(lora_tensor, "lora_tensor");

         // load tensor data
         size_t offset = fin.tellg();
@@ -2987,6 +2880,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {

             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -3014,7 +2922,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }

             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");

             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2937,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");

             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }

             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }

             struct ggml_cgraph gf = ggml_build_forward(r);
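The graph built here is the standard LoRA merge w = w + BA*s: ggml_mul_mat forms the low-rank product, ggml_scale_inplace applies s, and ggml_add / ggml_add_inplace folds it into the weights, now with each intermediate optionally offloaded to CUDA. The same computation in plain scalar form, as a sketch (matrix layout and names are assumptions, not from the diff):

    #include <cstddef>
    #include <vector>

    // W is n x m, A is r x m, B is r x n (row-major), s = alpha / r.
    void lora_merge(std::vector<std::vector<float>> & W,
                    const std::vector<std::vector<float>> & A,
                    const std::vector<std::vector<float>> & B,
                    float s) {
        const size_t n = W.size(), m = W[0].size(), r = A.size();
        for (size_t i = 0; i < n; i++) {
            for (size_t j = 0; j < m; j++) {
                float delta = 0.0f;
                for (size_t k = 0; k < r; k++) {
                    delta += B[k][i] * A[k][j]; // (B^T A)[i][j]
                }
                W[i][j] += s * delta; // w = w + BA*s
            }
        }
    }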
@@ -3091,8 +3017,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {

 #define LLAMA_MAX_RNG_STATE (64*1024)

-void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed < 0) {
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3336,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     return nread;
 }

-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");

     // sanity checks
@@ -3390,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }

+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading session file: %s\n", err.what());
+        return false;
+    }
+}
+
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");

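With the try/catch moved into an internal helper, llama_load_session_file now reports failure through its bool return instead of letting C++ exceptions escape the C API. Usage sketch (fragment, assuming an initialized llama_context * ctx and a hypothetical session path):

    std::vector<llama_token> tokens(512);
    size_t n_token_count = 0;
    if (!llama_load_session_file(ctx, "session.bin", tokens.data(),
                                 tokens.size(), &n_token_count)) {
        fprintf(stderr, "no usable session, starting fresh\n");
    }
    // ... evaluate and sample ...
    llama_save_session_file(ctx, "session.bin", tokens.data(), n_token_count);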
@@ -3421,7 +3356,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+            const float * embd,
+            int n_tokens,
+            int n_past,
+            int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,7 +3399,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {

     const std::vector<llama_token> tmp(n_batch, llama_token_bos());

-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3523,23 +3480,35 @@ llama_token llama_token_nl() {
     return 13;
 }

+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

-void llama_print_timings(struct llama_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+    };

-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_eval   = std::max(1, ctx->n_eval);
-    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+    return result;
+}
+
+void llama_print_timings(struct llama_context * ctx) {
+    const llama_timings timings = llama_get_timings(ctx);

     fprintf(stderr, "\n");
-    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
-    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {