llama_cpp 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +39 -1
- data/ext/llama_cpp/src/ggml.c +914 -509
- data/ext/llama_cpp/src/ggml.h +42 -27
- data/ext/llama_cpp/src/llama.cpp +293 -303
- data/ext/llama_cpp/src/llama.h +19 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -2
- data/sig/llama_cpp.rbs +52 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -12,6 +12,19 @@
 #include <cassert>
 #include <cstring>
 
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#else
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#endif
+
+#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
+#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -142,6 +155,10 @@ struct llama_model {
     // the model memory buffer
     std::vector<uint8_t> buf;
 
+    // model memory mapped file
+    void * mm_addr = NULL;
+    uint64_t mm_length = 0;
+
     // tensors
     int n_loaded;
     std::unordered_map<std::string, struct ggml_tensor *> tensors;
@@ -165,6 +182,7 @@ struct llama_context {
 
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
+    bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
     int64_t t_eval_us = 0;
@@ -206,7 +224,7 @@ struct llama_context {
         }
 
         if (buf_last >= 0) {
-            buf_max_size[buf_last] =
+            buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
         }
 
         buf_last = i;
@@ -238,14 +256,15 @@ static bool kv_cache_init(
     const int n_embd = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const
-    const
+    const int64_t n_mem = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
     struct ggml_init_params params;
    params.mem_size = cache.buf.size();
    params.mem_buffer = cache.buf.data();
+   params.no_alloc = false;
 
    cache.ctx = ggml_init(params);
 
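For a sense of the sizes involved, the short program below reproduces the `cache.buf.resize()` arithmetic from the `kv_cache_init` hunk above. The hyperparameters (n_embd = 4096, n_layer = 32, n_ctx = 512) and the 2-byte f16 element size are assumptions chosen to resemble a 7B model; they are not values taken from this diff.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // assumed 7B-style hyperparameters, for illustration only
    const int64_t n_embd  = 4096;
    const int64_t n_layer = 32;
    const int64_t n_ctx   = 512;
    const int64_t wsize   = 2;            // bytes per element for an f16 cache
    const int64_t MB      = 1024*1024;

    const int64_t n_mem      = n_layer*n_ctx;   // cached positions across all layers
    const int64_t n_elements = n_embd*n_mem;    // elements in K (and again in V)

    // same formula as cache.buf.resize(): K + V plus ~2 MB of slack
    const int64_t bytes = 2*n_elements*wsize + 2*MB;
    printf("KV cache buffer: %.2f MB\n", bytes/(1024.0*1024.0));   // ~258 MB
    return 0;
}
```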
@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //
 
+static void *mmap_file(const char *fname, uint64_t *mm_length) {
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+    HANDLE hFile = CreateFileA(fname,
+                               GENERIC_READ,
+                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                               NULL,
+                               OPEN_EXISTING,
+                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
+                               NULL);
+    if (hFile == INVALID_HANDLE_VALUE) return 0;
+    LARGE_INTEGER fileSize;
+    fileSize.QuadPart = -1;
+    GetFileSizeEx(hFile, &fileSize);
+    int64_t length = fileSize.QuadPart;
+    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+    CloseHandle(hFile);
+    if (!hMapping) return 0;
+    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+    CloseHandle(hMapping);
+    if (!addr) return 0;
+#else
+    int fd = open(fname, O_RDONLY);
+    if (fd == -1) return 0;
+    int64_t length = lseek(fd, 0, SEEK_END);
+    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+    close(fd);
+    if (addr == MAP_FAILED) return 0;
+#endif
+    *mm_length = length;
+    return addr;
+}
+
+static void munmap_file(void * addr, size_t length) {
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+    UnmapViewOfFile(addr);
+#else
+    munmap(addr, length);
+#endif
+}
+
+static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
+    fprintf(stderr,
+            "%s: invalid model file (bad magic [got %#x want %#x])\n"
+            "\tyou most likely need to regenerate your ggml files\n"
+            "\tthe benefit is you'll get 10-100x faster load times\n"
+            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
+            path, got, want);
+    return false;
+}
+
 static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
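Loading now goes through the memory-mapping helpers added above instead of buffered reads. The standalone POSIX program below is a sketch of the same technique `mmap_file`/`munmap_file` use (open, map read-only, touch a few bytes, unmap); it is an illustration, not code from this package, and it ignores the Windows branch.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv) {
    if (argc < 2) return 1;

    int fd = open(argv[1], O_RDONLY);
    if (fd == -1) return 1;

    // determine the size with lseek, the same trick mmap_file uses
    const int64_t length = lseek(fd, 0, SEEK_END);

    // read-only shared mapping; pages are faulted in lazily on first access
    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);                                // the mapping outlives the descriptor
    if (addr == MAP_FAILED) return 1;

    // peek at the file magic without a single explicit read()
    uint32_t magic = 0;
    memcpy(&magic, addr, sizeof(magic));
    printf("magic = %#x, mapped %lld bytes\n", magic, (long long) length);

    munmap(addr, length);
    return 0;
}
```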
@@ -299,22 +370,24 @@ static bool llama_model_load(
         void *progress_callback_user_data) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
-
-
-    lctx.t_start_us = t_start_us;
-
-    std::vector<char> f_buf(1024*1024);
+    lctx.t_start_us = ggml_time_us();
 
     auto & model = lctx.model;
     auto & vocab = lctx.vocab;
 
     auto fin = std::ifstream(fname, std::ios::binary);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
     }
 
+    std::vector<char> f_buf(1024*1024);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+
+    fin.seekg(0, fin.end);
+    const size_t file_size = fin.tellg();
+    fin.seekg(0);
+
     // verify magic
     {
         uint32_t magic;
@@ -325,8 +398,7 @@ static bool llama_model_load(
             return false;
         }
         if (magic != LLAMA_FILE_MAGIC) {
-
-            return false;
+            return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
         }
 
         uint32_t format_version;
@@ -449,43 +521,24 @@ static bool llama_model_load(
         }
     }
 
+    // map model into memory
+    char *mm_addr = NULL;
+    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
+    if (model.mm_addr == NULL) {
+        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+    mm_addr = (char *)model.mm_addr;
+    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
+
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
-
     {
-        const auto &
-
-        const int n_embd = hparams.n_embd;
+        const auto &hparams = model.hparams;
         const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
-
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
-
         ctx_size += (5 + 10*n_layer)*256; // object overhead
-
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
     }
 
     // print memory requirements
@@ -495,6 +548,7 @@ static bool llama_model_load(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
+            model.mm_length +
            MEM_REQ_SCRATCH0.at(model.type) +
            MEM_REQ_SCRATCH1.at(model.type) +
            MEM_REQ_EVAL.at (model.type);
@@ -514,6 +568,7 @@ static bool llama_model_load(
         struct ggml_init_params params = {
            /*.mem_size =*/ lctx.model.buf.size(),
            /*.mem_buffer =*/ lctx.model.buf.data(),
+           /*.no_alloc =*/ true,
         };
 
         model.ctx = ggml_init(params);
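With `/*.no_alloc =*/ true` the ggml context only lays out tensor metadata, which is why the reported ctx size drops from megabytes to kilobytes of object overhead; the weights themselves stay in the mapped file and each tensor's data pointer is aimed at the right offset inside it. The helper below is a rough sketch of that pattern against the ggml API bundled with this gem; the function name and parameters are invented for illustration.

```cpp
#include <cstdint>
#include <vector>
#include "ggml.h"   // the copy shipped in data/ext/llama_cpp/src/

// Sketch: create a metadata-only ggml context and point a tensor into an
// externally managed buffer (e.g. a memory-mapped model file), mirroring
// what llama_model_load does after this change. Illustrative only.
static struct ggml_tensor * map_weight(void * mm_addr, size_t offset,
                                       int ne0, int ne1,
                                       std::vector<uint8_t> & meta_buf) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ meta_buf.size(),
        /*.mem_buffer =*/ meta_buf.data(),
        /*.no_alloc   =*/ true,            // tensor headers only, no data storage
    };
    struct ggml_context * ctx = ggml_init(params);

    // the header lands in meta_buf; tensor->data is left for us to fill in
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, ne0, ne1);
    w->data = (char *) mm_addr + offset;   // no copy: bytes stay in the mapping
    return w;
}
```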
@@ -576,234 +631,106 @@ static bool llama_model_load(
         }
     }
 
-    const size_t file_offset = fin.tellg();
-
-    fin.close();
-
     std::vector<uint8_t> tmp;
 
     if (progress_callback) {
         progress_callback(0.0, progress_callback_user_data);
     }
 
-
-        const int part_id = i;
-        //const int part_id = n_parts - i - 1;
+    fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
 
-
-
-
-
-
-        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+    // load weights
+    {
+        size_t total_size = 0;
+        model.n_loaded = 0;
 
-
-
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ftype;
 
-
-
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
 
-
+            if (fin.eof()) {
+                break;
+            }
 
-
-
-
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }
 
-
+            std::string name(length, 0);
+            fin.read(&name[0], length);
 
-
+            if (model.tensors.find(name.data()) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                return false;
+            }
 
-
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
+            auto tensor = model.tensors[name.data()];
 
-
-
-
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                return false;
+            }
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
+                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                return false;
+            }
+            if (0) {
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+            }
 
-
+            switch (ftype) {
+                case 0: // f32
+                case 1: // f16
                     break;
-
-
-
-
-
-
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                case 2: // q4_0
+                case 3: // q4_1
+                    assert(ne[0] % 64 == 0);
+                    break;
+                default:
+                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
                     return false;
-
-
-            // split_type = 0: split by columns
-            // split_type = 1: split by rows
-            int split_type = 0;
-
-            // split_type = 0:
-            // regex:
-            // - tok_embeddings.*
-            // - layers.*.attention.wo.weight
-            // - layers.*.feed_forward.w2.weight
-
-            // split_type = 1:
-            // regex:
-            // - output.*
-            // - layers.*.attention.wq.weight
-            // - layers.*.attention.wk.weight
-            // - layers.*.attention.wv.weight
-            // - layers.*.feed_forward.w1.weight
-            // - layers.*.feed_forward.w3.weight
-            if (name.find("tok_embeddings") != std::string::npos) {
-                split_type = 0;
-            } else if (name.find("layers") != std::string::npos) {
-                if (name.find("attention.wo.weight") != std::string::npos) {
-                    split_type = 0;
-                } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
-                    split_type = 0;
-                } else {
-                    split_type = 1;
-                }
-            } else if (name.find("output") != std::string::npos) {
-                split_type = 1;
-            }
-
-            auto tensor = model.tensors[name.data()];
-
-            if (n_dims == 1) {
-                if (ggml_nelements(tensor) != nelements) {
-                    fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                    return false;
-                }
-            } else {
-                if (ggml_nelements(tensor)/n_parts != nelements) {
-                    fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                    return false;
-                }
-            }
-
-            if (n_dims == 1) {
-                if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                    fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                            __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                    return false;
-                }
-            } else {
-                if (split_type == 0) {
-                    if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
-                        return false;
-                    }
-                } else {
-                    if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
-                        return false;
-                    }
-                }
-            }
-
-            if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
-            }
-
-            size_t bpe = 0;
-
-            switch (ftype) {
-                case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
-                case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
-                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                default:
-                {
-                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                    return false;
-                }
-            };
-
-            if (n_dims == 1 || n_parts == 1) {
-                if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                    fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                            __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                    return false;
-                }
-
-                if (part_id == 0) {
-                    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-                } else {
-                    fin.seekg(ggml_nbytes(tensor), std::ios::cur);
-                }
-
-                total_size += ggml_nbytes(tensor);
-            } else {
-                if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
-                    fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                            __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
-                    return false;
-                }
-
-                if (split_type == 0) {
-                    const int np0 = ne[0];
-
-                    const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                    assert(row_size == tensor->nb[1]);
-
-                    for (int i1 = 0; i1 < ne[1]; ++i1) {
-                        const size_t offset_row = i1*row_size;
-                        const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                        fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
-                    }
-                } else {
-                    const int np1 = ne[1];
-
-                    const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-
-                    for (int i1 = 0; i1 < ne[1]; ++i1) {
-                        const size_t offset_row = (i1 + part_id*np1)*row_size;
-                        fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
-                    }
-                }
-
-                total_size += ggml_nbytes(tensor)/n_parts;
-            }
-
-            //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-            model.n_loaded++;
-
-            // progress
-            if (progress_callback) {
-                float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
-                float current_progress = (float(i) + current_file_progress) / float(n_parts);
-                progress_callback(current_progress, progress_callback_user_data);
-            }
-            if (model.n_loaded % 8 == 0) {
-                fprintf(stderr, ".");
-                fflush(stderr);
-            }
-        }
-
-        fprintf(stderr, " done\n");
+            };
 
-
-
-
-
-
-
+            // load the tensor data into memory without copying or reading it
+            size_t offset = fin.tellg();
+            size_t tensor_data_size = ggml_nbytes(tensor);
+            offset = (offset + 31) & -32;
+            tensor->data = mm_addr + offset;
+            fin.seekg(offset + tensor_data_size);
+            total_size += tensor_data_size;
+            model.n_loaded++;
+
+            // progress
+            if (progress_callback) {
+                double current_progress = size_t(fin.tellg()) / double(file_size);
+                progress_callback(current_progress, progress_callback_user_data);
            }
        }
 
        fin.close();
+
+        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
+        if (model.n_loaded == 0) {
+            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+        } else if (model.n_loaded != (int) model.tensors.size()) {
+            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            return false;
+        }
     }
 
-
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
 
     if (progress_callback) {
         progress_callback(1.0, progress_callback_user_data);
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
     struct ggml_init_params params = {
        /*.mem_size =*/ buf_compute.size(),
        /*.mem_buffer =*/ buf_compute.data(),
+       /*.no_alloc =*/ false,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
@@ -856,7 +784,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -882,37 +810,35 @@ static bool llama_eval_internal(
 
         // self-attention
         {
-
-            struct ggml_tensor *
-            struct ggml_tensor *
+            // compute Q and K and RoPE them
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
 
             // store key and value to memory
-
+            {
+                // compute the transposed [N, n_embd] V matrix
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v =
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                        ( n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
+                // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }
 
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);
 
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-
-
-
-                            n_embd/n_head, n_head, n_past + N),
-                        n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
 
             // K * Q
@@ -930,18 +856,23 @@ static bool llama_eval_internal(
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
-            //
-            struct ggml_tensor *
-
-
-
-
-
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
 
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0,
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -1027,9 +958,13 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute (ctx0, &gf);
 
+    // print timing information per ggml operation (for debugging purposes)
+    // requires GGML_PERF to be defined
+    //ggml_graph_print(&gf);
+
+    // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}
 
     //embd_w.resize(n_vocab*N);
@@ -1126,7 +1061,7 @@ struct llama_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            size_t char_len =
+            size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
@@ -1266,6 +1201,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
     const auto & logits = lctx.logits;
     const auto * plogits = logits.data() + logits.size() - n_logits;
 
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        llama_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
     std::vector<std::pair<float, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);
 
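The new `temp <= 0` branch turns sampling into plain greedy decoding. It is special-cased rather than fed through the usual path because `softmax(logits / temp)` sharpens toward the arg-max as temp approaches zero, and dividing by zero is not an option; the toy program below (made-up logits) shows that limit numerically.

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float logits[3] = { 2.0f, 1.0f, 0.5f };   // made-up values
    for (float temp : { 1.0f, 0.5f, 0.1f }) {
        float p[3], sum = 0.0f;
        for (int i = 0; i < 3; ++i) { p[i] = std::exp(logits[i]/temp); sum += p[i]; }
        printf("temp=%.1f ->", temp);
        for (int i = 0; i < 3; ++i) printf(" %.3f", p[i]/sum);
        printf("\n");
    }
    // temp=1.0 -> 0.629 0.231 0.140
    // temp=0.5 -> 0.844 0.114 0.042
    // temp=0.1 -> 1.000 0.000 0.000   (effectively the arg-max, i.e. greedy)
    return 0;
}
```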
@@ -1287,17 +1236,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }
 
-    sample_top_k(logits_id, top_k);
-
-    float maxl = -std::numeric_limits<float>::infinity();
-    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
-    }
+    sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
 
     // compute probs for the top k tokens
     std::vector<float> probs;
     probs.reserve(logits_id.size());
 
+    float maxl = logits_id[0].first;
     double sum = 0.0;
     for (const auto & kv : logits_id) {
         const float p = expf(kv.first - maxl);
@@ -1320,16 +1265,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
                 break;
             }
         }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
     }
 
     //printf("\n");
     //for (int i = 0; i < (int) 10; i++) {
-    // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
     //}
     //printf("\n\n");
     //exit(0);
@@ -1385,8 +1325,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             return false;
         }
         if (magic != LLAMA_FILE_MAGIC) {
-
-            return false;
+            return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
         }
 
         fout.write((char *) &magic, sizeof(magic));
@@ -1444,7 +1383,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             return false;
         }
 
-        std::
+        std::vector<char> word(32);
         vocab.id_to_token.resize(n_vocab);
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -1452,17 +1391,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             fout.write((char *) &len, sizeof(len));
 
             word.resize(len);
-            finp.read ((char *) word
-            fout.write((char *) word
+            finp.read ((char *) &word[0], len);
+            fout.write((char *) &word[0], len);
 
             float score;
             finp.read ((char *) &score, sizeof(score));
             fout.write((char *) &score, sizeof(score));
 
-            vocab.token_to_id[word] = i;
+            vocab.token_to_id[word.data()] = i;
 
             auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word;
+            tok_score.tok = word.data();
             tok_score.score = score;
         }
     }
@@ -1503,6 +1442,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             std::string name(length, 0);
             finp.read (&name[0], length);
 
+            {
+                // ensure tensor data is aligned
+                uint64_t offset = finp.tellg();
+                offset = (offset + 31) & -32;
+                finp.seekg(offset);
+            }
+
             {
                 static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                 printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@@ -1558,6 +1504,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             fout.write(&name[0], length);
 
+            {
+                // ensure tensor data is aligned
+                uint64_t offset = fout.tellp();
+                offset = (offset + 31) & -32;
+                fout.seekp(offset);
+            }
+
             if (quantize) {
                 printf("quantizing .. ");
                 work.resize(nelements); // for quantization
@@ -1655,7 +1608,10 @@ struct llama_context * llama_init_from_file(
 
     if (params.use_mlock) {
         char *err;
-        if (!ggml_mlock(ctx->model.ctx,
+        if (!ggml_mlock(ctx->model.ctx,
+                        ctx->model.mm_addr,
+                        ctx->model.mm_length,
+                        &err)) {
             fprintf(stderr, "%s\n", err);
             free(err);
             llama_free(ctx);
@@ -1664,7 +1620,7 @@ struct llama_context * llama_init_from_file(
     }
 
     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -1705,6 +1661,10 @@ void llama_free(struct llama_context * ctx) {
         ggml_free(ctx->model.ctx);
     }
 
+    if (ctx->model.mm_addr) {
+        munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
+    }
+
     delete ctx;
 }
 
@@ -1720,6 +1680,33 @@ int llama_model_quantize(
     return 0;
 }
 
+// Returns the KV cache that will contain the context for the
+// ongoing prediction with the model.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Sets the KV cache containing the current context for the model
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+        const uint8_t * kv_cache,
+        size_t n_size,
+        int n_token_count) {
+    // Make sure we have the same kv cache setup
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
         struct llama_context * ctx,
         const llama_token * tokens,
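The four new KV-cache accessors make it possible to snapshot the attention state after a prompt has been evaluated and to restore it later instead of re-evaluating the prompt. A minimal usage sketch follows; the helper names are invented, and error handling plus context setup are assumed to exist elsewhere.

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"   // the C API shipped in data/ext/llama_cpp/src/llama.h

// Sketch: deep-copy the KV cache after a prompt has been evaluated, then
// restore it later to continue from the same context without re-evaluating.
static std::vector<uint8_t> snapshot_kv(struct llama_context * ctx, int & n_tokens) {
    const uint8_t * src = llama_get_kv_cache(ctx);
    const size_t    len = llama_get_kv_cache_size(ctx);
    n_tokens = llama_get_kv_cache_token_count(ctx);

    return std::vector<uint8_t>(src, src + len);   // copy of the cache buffer
}

static void restore_kv(struct llama_context * ctx,
                       const std::vector<uint8_t> & snap, int n_tokens) {
    // the context must have been created with the same model/parameters,
    // otherwise the size check inside llama_set_kv_cache will fail
    llama_set_kv_cache(ctx, snap.data(), snap.size(), n_tokens);
}
```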
@@ -1730,7 +1717,11 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
-
+    // get a more accurate load time, upon first eval
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
     return 0;
 }
 
@@ -1823,9 +1814,9 @@ llama_token llama_sample_top_p_top_k(
 void llama_print_timings(struct llama_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
 
-    const int32_t n_sample =
-    const int32_t n_eval =
-    const int32_t n_p_eval =
+    const int32_t n_sample = Max(1, ctx->n_sample);
+    const int32_t n_eval = Max(1, ctx->n_eval);
+    const int32_t n_p_eval = Max(1, ctx->n_p_eval);
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,7 +1828,6 @@ void llama_print_timings(struct llama_context * ctx) {
 
 void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
-
     ctx->t_sample_us = ctx->n_sample = 0;
     ctx->t_eval_us = ctx->n_eval = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;