llama_cpp 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -66,6 +66,7 @@ enum e_model {
     MODEL_65B,
 };
 
+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }
 
+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * kB },
+        { MODEL_7B, 512ull * kB },
+        { MODEL_13B, 640ull * kB },
+        { MODEL_30B, 768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull },
+        { MODEL_7B, 128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
```
```diff
@@ -165,8 +194,8 @@ struct llama_layer {
 };
 
 struct llama_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
 
     struct ggml_context * ctx = NULL;
 
@@ -253,7 +282,13 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+#ifdef GGML_USE_METAL
+    ~llama_context() {
+        if (ctx_metal) {
+            ggml_metal_free(ctx_metal);
+        }
+    }
+#endif
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
```
```diff
@@ -364,96 +399,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
```
```diff
@@ -476,13 +429,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname,
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -528,9 +481,7 @@ struct llama_file_loader {
             std::string word = file.read_string(len);
 
             float score = 0.0f;
-
-            file.read_raw(&score, sizeof(score));
-            }
+            file.read_raw(&score, sizeof(score));
 
             vocab.token_to_id[word] = i;
 
@@ -539,19 +490,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-
-
-            file.read_raw(
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (
+            switch (tensor.type) {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
@@ -566,30 +517,20 @@ struct llama_file_loader {
             case GGML_TYPE_Q6_K:
                 break;
             default: {
-                throw std::runtime_error(format("unrecognized tensor type %u\n",
+                throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
             }
             }
 
-
-
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-
-
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-
-
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
```
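For orientation, the tensor record that the rewritten `read_tensor_metadata()` walks is: a u32 dimension count, a u32 name length, a u32 ggml type id, `n_dims` u32 extents, the name bytes, padding up to the next 32-byte boundary, and then the raw tensor data. Below is a minimal stand-alone sketch of that walk; it uses `std::ifstream` and a hypothetical `tensor_meta` struct rather than the gem's `llama_file`/`llama_load_tensor` types.

```cpp
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

// Hypothetical mirror of llama_load_tensor's metadata fields.
struct tensor_meta {
    std::string           name;
    uint32_t              type = 0;
    std::vector<uint32_t> ne;
    size_t                file_off = 0; // where the aligned tensor data starts
};

// Read one tensor header the way read_tensor_metadata() does.
static tensor_meta read_one_header(std::ifstream & f) {
    tensor_meta t;
    uint32_t n_dims = 0, name_len = 0;
    f.read(reinterpret_cast<char *>(&n_dims),   sizeof(n_dims));
    f.read(reinterpret_cast<char *>(&name_len), sizeof(name_len));
    f.read(reinterpret_cast<char *>(&t.type),   sizeof(t.type));
    t.ne.resize(n_dims);
    f.read(reinterpret_cast<char *>(t.ne.data()), sizeof(uint32_t) * n_dims);
    t.name.resize(name_len);
    f.read(&t.name[0], name_len);

    // skip to the next multiple of 32 bytes, like file.seek(-tell & 31, SEEK_CUR)
    const std::streamoff pos = f.tellg();
    f.seekg((32 - (pos % 32)) % 32, std::ios::cur);

    t.file_off = static_cast<size_t>(f.tellg());
    // the caller computes the data size from ne/type (llama_calc_tensor_size)
    // and seeks past it before reading the next header
    return t;
}
```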
```diff
@@ -659,56 +600,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap
-
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
        this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
```
```diff
@@ -774,7 +678,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -830,45 +734,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-
-
-
-
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
```
```diff
@@ -938,7 +810,7 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
```
```diff
@@ -1067,12 +939,12 @@ static void llama_model_load_internal(
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->
-    model.hparams = ml->
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1106,7 +978,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
```
```diff
@@ -1274,14 +1145,18 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
     } else {
-
+        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x
-                    __func__,
+            fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                    (vram_scratch + MB - 1) / MB); // round up
         }
     }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
```
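A worked instance of the scratch-size formula introduced above, plugging in the `MODEL_7B` entries from `VRAM_REQ_SCRATCH_BASE()` and `VRAM_REQ_SCRATCH_PER_CONTEXT()` added earlier in this diff (the stand-alone program is illustrative only):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const size_t kB = 1024, MB = 1024 * 1024;

    // MODEL_7B entries from the tables added in this diff
    const size_t vram_scratch_base        = 512 * kB; // per batch element
    const size_t vram_scratch_per_context = 128;      // bytes per batch element per context slot

    const size_t n_batch = 512, n_ctx = 2048;
    const size_t vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);

    // rounds up the same way the log message does: 512 x (512 kB + 2048 x 128 B) = 384 MB
    std::printf("scratch buffer: %zu MB\n", (vram_scratch + MB - 1) / MB);
    return 0;
}
```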
```diff
@@ -1290,6 +1165,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+    const int max_backend_supported_layers = hparams.n_layer + 3;
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1306,14 +1185,18 @@ static void llama_model_load_internal(
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
-
+#elif defined(GGML_USE_CLBLAST)
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers),
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 }
 
 // populate `tensors_by_name`
```
```diff
@@ -1369,22 +1252,26 @@ static bool llama_model_load(
 
 // evaluate the transformer
 //
-// - lctx:
-// - tokens:
-// -
-// -
-// -
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-
-
-
-
-
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
         const char * cgraph_fname) {
 
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1424,12 +1311,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
```
```diff
@@ -2012,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -2044,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();
 
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -2098,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2271,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }
 
 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
@@ -2292,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }
 
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
```
```diff
@@ -2366,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;
 
-
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype =
-        if (qtype.
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2380,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     if (tensor.type == GGML_TYPE_F16) {
         ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
     } else if (ggml_is_quantized(tensor.type)) {
-        qtype.
+        qtype.to_float(tensor.data, f32_output, nelements);
     } else {
         LLAMA_ASSERT(false); // unreachable
     }
@@ -2405,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
         } else {
-            qtype.
+            qtype.to_float(inbuf, outbuf, nels);
         }
     };
     workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -2451,9 +2342,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false
-
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
```
```diff
@@ -2654,6 +2544,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
+
+
 //
 // interface implementation
 //
@@ -2692,7 +2584,7 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model, model->vocab);
 
-    if (params.seed
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
```
```diff
@@ -2874,7 +2766,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }
 
@@ -2885,7 +2777,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2903,7 +2795,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -2964,7 +2856,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
                 return false;
             }
         }
-        ggml_tensor* lora_tensor;
+        ggml_tensor * lora_tensor;
         if (n_dims == 2) {
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
@@ -2972,6 +2864,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
+        ggml_set_name(lora_tensor, "lora_tensor");
 
         // load tensor data
         size_t offset = fin.tellg();
```
```diff
@@ -2987,6 +2880,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -3014,7 +2922,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2937,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }
 
             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
```
```diff
@@ -3091,8 +3017,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
-void llama_set_rng_seed(struct llama_context * ctx,
-    if (seed
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3336,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     return nread;
 }
 
-bool
+static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");
 
     // sanity checks
```
```diff
@@ -3390,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading session file: %s\n", err.what());
+        return false;
+    }
+}
+
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");
 
```
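With this wrapper, a corrupt or mismatched session file surfaces as a `false` return instead of an uncaught exception, so callers can keep a plain check. A hedged usage sketch (the path handling and capacity choice are placeholders):

```cpp
#include <vector>
#include "llama.h"

static bool restore_session(llama_context * ctx, const char * path, std::vector<llama_token> & tokens) {
    tokens.resize(llama_n_ctx(ctx));
    size_t n_loaded = 0;
    if (!llama_load_session_file(ctx, path, tokens.data(), tokens.size(), &n_loaded)) {
        return false; // read/format errors are caught internally and reported on stderr
    }
    tokens.resize(n_loaded);
    return true;
}
```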
```diff
@@ -3421,7 +3356,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
```
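A hedged usage sketch for the new `llama_eval_embd()` entry point: it consumes `n_tokens` rows of `n_embd` floats in place of token ids, with the same `n_past`/`n_threads` semantics as `llama_eval()`. Context setup is elided; `llama_n_embd()` is the accessor declared in `llama.h`:

```cpp
#include <vector>
#include "llama.h"

// Evaluate pre-computed input embeddings instead of token ids.
static int eval_embeddings(llama_context * ctx, const std::vector<float> & embd,
                           int n_past, int n_threads) {
    const int n_embd   = llama_n_embd(ctx);
    const int n_tokens = (int) embd.size() / n_embd; // one row of n_embd floats per position
    return llama_eval_embd(ctx, embd.data(), n_tokens, n_past, n_threads);
}
```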
```diff
@@ -3442,7 +3399,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
```
```diff
@@ -3523,23 +3480,35 @@ llama_token llama_token_nl() {
     return 13;
 }
 
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
-
-
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval =*/ std::max(1, ctx->n_eval),
+    };
 
-
-
-
+    return result;
+}
+
+void llama_print_timings(struct llama_context * ctx) {
+    const llama_timings timings = llama_get_timings(ctx);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__,
+    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
-    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
```