llama_cpp 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -6
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +246 -133
- data/ext/llama_cpp/src/ggml.c +362 -137
- data/ext/llama_cpp/src/ggml.h +13 -3
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +173 -102
- data/ext/llama_cpp/src/llama.h +30 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,7 +190,7 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
-#define GGML_QNT_VERSION
+#define GGML_QNT_VERSION 2 // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS 4
@@ -313,6 +313,7 @@ extern "C" {
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -849,7 +850,7 @@ extern "C" {
            int                   n_past);
 
    // in-place, returns view(a)
-    GGML_API struct ggml_tensor *
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past);
@@ -897,7 +898,16 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
-            int                   n_head
+            int                   n_head,
+            float                 bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 min,
+            float                 max);
 
    // padding = 1
    // TODO: we don't support extra parameters for now
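
As an aside, here is a minimal sketch (not part of the diff) of how the ggml_clamp declaration added above could be exercised from C++; the context size and tensor shape are made up for illustration, and the clamp only takes effect once the graph containing it is computed:

    // hedged sketch: records an in-place clamp op using the new ggml_clamp API
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);

        // per the header comment, ggml_clamp is in-place and returns view(a);
        // elements of a are clipped into [-1, 1] when the graph is evaluated
        struct ggml_tensor * c = ggml_clamp(ctx, a, -1.0f, 1.0f);
        (void) c;

        // ggml_alibi now also takes a bias_max argument, e.g.
        // ggml_alibi(ctx, a, n_past, n_head, 8.0f); (value chosen arbitrarily here)

        ggml_free(ctx);
        return 0;
    }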
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -101,12 +101,12 @@ struct llama_file {
        LLAMA_ASSERT(ret == 0); // same
    }
 
-    void read_raw(void * ptr, size_t
-    if (
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
            return;
        }
        errno = 0;
-        std::size_t ret = std::fread(ptr,
+        std::size_t ret = std::fread(ptr, len, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
@@ -127,12 +127,12 @@ struct llama_file {
        return std::string(chars.data(), len);
    }
 
-    void write_raw(const void * ptr, size_t
-    if (
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
            return;
        }
        errno = 0;
-        size_t ret = std::fwrite(ptr,
+        size_t ret = std::fwrite(ptr, len, 1, fp);
        if (ret != 1) {
            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file,
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }
 
-        if (prefetch) {
+        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
@@ -267,9 +267,9 @@ struct llama_mlock {
        }
    }
 
-    void init(void *
-    LLAMA_ASSERT(
-
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
    }
 
    void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
        return (size_t) si.dwPageSize;
    }
 
-    bool raw_lock(void *
+    bool raw_lock(void * ptr, size_t len) {
        for (int tries = 1; ; tries++) {
-            if (VirtualLock(
+            if (VirtualLock(ptr, len)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }
 
@@ -363,7 +363,7 @@ struct llama_mlock {
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
-            size_t increment =
+            size_t increment = len + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += increment;
            max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
            }
        }
 
-    void raw_unlock(void *
-    if (!VirtualUnlock(
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
@@ -388,12 +388,12 @@ struct llama_mlock {
        return (size_t) 65536;
    }
 
-    bool raw_lock(const void * addr, size_t
+    bool raw_lock(const void * addr, size_t len) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
        return false;
    }
 
-    void raw_unlock(const void * addr, size_t
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
 
@@ -404,10 +404,10 @@ struct llama_buffer {
 
    llama_buffer() = default;
 
-    void resize(size_t
-    if (
+    void resize(size_t len) {
        delete[] addr;
-        addr = new uint8_t[
-
+        addr = new uint8_t[len];
+        size = len;
    }
 
    ~llama_buffer() {
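
For context, a standalone sketch (not taken from the package) of the prefetch behaviour the llama_mmap change above implements: a prefetch of 0 now skips the madvise call entirely, and a positive value only advises the first min(file size, prefetch) bytes; the helper name is illustrative only:

    // sketch of the POSIX branch of the new prefetch logic
    #include <sys/mman.h>
    #include <algorithm>
    #include <cerrno>
    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    static void advise_prefetch(void * addr, size_t file_size, size_t prefetch) {
        if (prefetch > 0) {
            // advise the kernel to preload only the requested prefix of the mapping
            if (madvise(addr, std::min(file_size, prefetch), MADV_WILLNEED)) {
                std::fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                             std::strerror(errno));
            }
        }
    }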
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -45,6 +46,7 @@ enum e_model {
    MODEL_65B,
 };
 
+
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -110,7 +112,7 @@ struct llama_hparams {
    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
    bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
    }
 };
 
@@ -406,6 +408,7 @@ enum llama_file_version {
    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
    LLAMA_FILE_VERSION_GGJT_V1, // added padding
    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };
 
 struct llama_file_loader {
@@ -424,24 +427,30 @@ struct llama_file_loader {
    }
    void read_magic() {
        uint32_t magic = file.read_u32();
-        uint32_t version = 0;
 
-        if (magic
-
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
        }
 
-
-
-
-
-
-
-
-
-
-
-
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
        }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
    }
    void read_hparams() {
        hparams.n_vocab = file.read_u32();
@@ -499,7 +508,7 @@ struct llama_file_loader {
 
        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
            // skip to the next multiple of 32 bytes
-            file.seek(-file.tell() & 31, SEEK_CUR);
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        }
        shard.file_idx = file_idx;
        shard.file_off = file.tell();
@@ -574,7 +583,7 @@ struct llama_file_saver {
        file.write_u32(new_type);
        file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
        file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
        file.write_raw(new_data, new_size);
    }
@@ -641,7 +650,7 @@ struct llama_model_loader {
        }
    }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
        auto it = tensors_map.name_to_idx.find(name);
        if (it == tensors_map.name_to_idx.end()) {
            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +661,10 @@ struct llama_model_loader {
                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
        }
 
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
    }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
        struct ggml_tensor * tensor;
        if (lt.ne.size() == 2) {
            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +674,7 @@ struct llama_model_loader {
        }
        ggml_set_name(tensor, lt.name.c_str());
        LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
        lt.ggml_tensor = tensor;
        num_ggml_tensors_created++;
        return tensor;
@@ -678,12 +688,16 @@ struct llama_model_loader {
 
    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
        size_t data_size = 0;
+        size_t prefetch_size = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
        }
 
        if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
            if (!lmlock) {
                // Don't call the callback since the actual loading will be lazy
                // and we can't measure it.
@@ -696,6 +710,9 @@ struct llama_model_loader {
 
        size_t done_size = 0;
        for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
            if (progress_callback) {
                progress_callback((float) done_size / data_size, progress_callback_user_data);
            }
@@ -708,9 +725,6 @@ struct llama_model_loader {
                lmlock->grow_to(done_size);
            }
        }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
    }
 
    void load_data_for(llama_load_tensor & lt) {
@@ -835,6 +849,21 @@ bool llama_mlock_supported() {
    return llama_mlock::SUPPORTED;
 }
 
+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
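
The hunk above adds llama_init_backend() and llama_time_us(), which also appear in the updated llama.h of this release (that hunk is not reproduced here). A hedged sketch of how a caller might use them; the model path is a placeholder and error handling is minimal:

    // sketch: initialize the backend once before loading any model
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_init_backend();              // one-time timer / f16 table setup

        const int64_t t_start = llama_time_us();

        llama_context_params params = llama_context_default_params();
        llama_context * ctx = llama_init_from_file("model.bin", params); // placeholder path
        if (ctx == NULL) {
            std::fprintf(stderr, "failed to load model\n");
            return 1;
        }

        std::fprintf(stderr, "load took %.2f ms\n", (llama_time_us() - t_start) / 1000.0);

        llama_free(ctx);
        return 0;
    }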
@@ -844,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
        case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
        case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
    }
 
    return "unknown";
@@ -924,11 +954,19 @@ static void llama_model_load_internal(
        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
    }
 
-    if (file_version
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
        if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
        }
    }
 
@@ -941,27 +979,7 @@ static void llama_model_load_internal(
    size_t ctx_size;
    size_t mmapped_size;
    ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
    // create the ggml context
    {
@@ -983,7 +1001,14 @@ static void llama_model_load_internal(
        }
    }
 
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
    // prepare memory for the weights
+    size_t vram_total = 0;
    {
        const uint32_t n_embd = hparams.n_embd;
        const uint32_t n_layer = hparams.n_layer;
@@ -991,70 +1016,122 @@ static void llama_model_load_internal(
 
        ml->ggml_ctx = ctx;
 
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;
 
        model.layers.resize(n_layer);
        for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
            auto & layer = model.layers[i];
 
            std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
 
-
-
-
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
        }
    }
 
    ml->done_getting_tensors();
 
-    //
-
-
-    }
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
-
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
-    {
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
 
-
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }
 
-
-            const auto & layer = model.layers[i];
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-
-
-
-
-
-
-
+#ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
        }
-
-
-
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
        }
+    }
+#endif // GGML_USE_CUBLAS
 
-
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
    }
-
-    (
-#endif
+
+    model.mapping = std::move(ml->mapping);
 
    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
@@ -1153,10 +1230,8 @@ static bool llama_eval_internal(
        {
            cur = ggml_rms_norm(ctx0, inpL);
 
-            // cur = attention_norm
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                    cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
        }
 
        // self-attention
@@ -1263,10 +1338,8 @@ static bool llama_eval_internal(
        {
            cur = ggml_rms_norm(ctx0, inpFF);
 
-            // cur = ffn_norm
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                    cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
        }
 
        struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1303,10 +1376,8 @@ static bool llama_eval_internal(
 
        inpL = ggml_rms_norm(ctx0, inpL);
 
-        // inpL = norm
-        inpL = ggml_mul(ctx0,
-                ggml_repeat(ctx0, model.norm, inpL),
-                inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);
 
        embeddings = inpL;
    }
@@ -2130,7 +2201,7 @@ struct llama_context * llama_init_from_file(
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
-
+                *cur_percentage_p = percentage;
                fprintf(stderr, ".");
                fflush(stderr);
                if (percentage >= 100) {
@@ -2223,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
-        if (magic !=
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
            fprintf(stderr, "%s: bad file magic\n", __func__);
            return 1;
        }
@@ -2287,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
        // maybe this should in llama_model_loader
        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
        }
    }
 
@@ -2380,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            }
            size_t idx = model_loader->tensors_map.name_to_idx[base_name];
            llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
            lt.data = (uint8_t *) lt.ggml_tensor->data;
            model_loader->load_data_for(lt);
            lt.ggml_tensor->data = lt.data;
@@ -2606,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx,
-
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;
 
    // set rng
    {
|