llama_cpp 0.2.2 → 0.3.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +305 -133
- data/ext/llama_cpp/src/ggml-cuda.cu +367 -69
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +262 -291
- data/ext/llama_cpp/src/llama.h +49 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +14 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -21,9 +21,13 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
 #define QK_K 256
 #endif
 #endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_vocab {
+    using id = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
 
@@ -198,10 +215,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;
 
-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;
 
@@ -215,6 +228,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
     }
 };
 
-struct llama_vocab {
-    using id = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;
 
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
     int32_t n_eval = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
 
     size_t mem_per_token = 0;
 
@@ -351,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
@@ -463,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname,
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -526,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-
-
-            file.read_raw(
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -553,30 +484,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n",
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
             }
 
-
-
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-
-
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-
-
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
@@ -646,56 +567,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
    std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap
-
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
        this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -761,7 +645,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -817,45 +701,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-
-
-
-
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
@@ -925,7 +777,7 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -964,7 +816,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -973,6 +825,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
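Note: `llama_init_backend` now takes a `numa` flag and calls `ggml_numa_init()` once at startup when it is true. A minimal, hedged usage sketch follows; the `--numa` command-line switch here is hypothetical and not part of this diff:

```cpp
// Hedged sketch: opting in to the new NUMA support at program start.
#include <string>
#include "llama.h"

int main(int argc, char ** argv) {
    // hypothetical switch; any boolean source works
    const bool numa = (argc > 1 && std::string(argv[1]) == "--numa");

    llama_init_backend(numa); // numa == true triggers ggml_numa_init()

    // ... load a model, create a context, evaluate ...
    return 0;
}
```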
@@ -1033,7 +889,8 @@ static const char *llama_model_type_name(e_model type) {
 
 static void llama_model_load_internal(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1047,15 +904,14 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
 
-
+    model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-
-
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1089,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
@@ -1122,15 +977,15 @@ static void llama_model_load_internal(
 
     // create the ggml context
     {
-
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-
-
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
-            /*.mem_size =*/
-            /*.mem_buffer =*/
+            /*.mem_size =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc =*/ ml->use_mmap,
         };
 
@@ -1311,7 +1166,7 @@ static void llama_model_load_internal(
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1176,13 @@ static void llama_model_load_internal(
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }
 
 static bool llama_model_load(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1340,7 +1196,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1351,22 +1207,26 @@ static bool llama_model_load(
 
 // evaluate the transformer
 //
-//   - lctx:
-//   - tokens:
-//   -
-//   -
-//   -
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - embd       embeddings input
+//   - n_tokens   number of tokens
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-
-
-
-
-
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
         const char * cgraph_fname) {
 
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1378,7 +1238,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
 
-    const auto & kv_self =
+    const auto & kv_self = lctx.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1406,12 +1266,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
@@ -1473,11 +1339,11 @@ static bool llama_eval_internal(
                 offload_func_kq(tmpq);
                 ggml_set_name(tmpq, "tmpq");
 
-                struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+                struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
                 offload_func_kq(Kcur);
                 ggml_set_name(Kcur, "Kcur");
 
-                struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+                struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
                 offload_func_kq(Qcur);
                 ggml_set_name(Qcur, "Qcur");
 
@@ -1726,7 +1592,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
-    lctx.
+    lctx.kv_self.n = n_past + N;
 
     // extract logits
     {
@@ -2005,9 +1871,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;
 
-        // Check if the running sum is
-
-
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
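The corrected cut-off keeps at least `min_keep` candidates even after the cumulative probability passes `p`. A self-contained sketch with made-up probabilities (not taken from the diff) shows the effect:

```cpp
// Illustrative numbers only: how the corrected top-p cut interacts with min_keep.
// With sorted probabilities {0.5, 0.3, 0.1, 0.05, 0.05}, top_p = 0.7 and
// min_keep = 3, the running sum passes 0.7 at i = 1, but the cut is deferred
// until i + 1 >= min_keep, so three candidates survive.
#include <cstddef>
#include <cstdio>

int main() {
    const float  p[]      = {0.5f, 0.3f, 0.1f, 0.05f, 0.05f};
    const float  top_p    = 0.7f;
    const size_t min_keep = 3;

    size_t last_idx = 5;
    float  cum_sum  = 0.0f;
    for (size_t i = 0; i < 5; ++i) {
        cum_sum += p[i];
        if (cum_sum >= top_p && i + 1 >= min_keep) {
            last_idx = i + 1;
            break;
        }
    }
    std::printf("kept %zu candidates\n", last_idx); // prints 3
    return 0;
}
```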
@@ -2432,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false
-
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2459,6 +2325,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
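The new `use_more_bits` heuristic bumps selected layers to a higher-bit quant type: the first and last eighth of the layers, plus every third layer in between. An illustrative stand-alone sketch for a hypothetical 32-layer model (the layer count is assumed, not from the diff):

```cpp
// Illustrative only: reproduces the use_more_bits() heuristic above to show
// which layers of a hypothetical 32-layer model would be promoted to Q6_K.
#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layer = 32; // assumed layer count
    for (int i = 0; i < n_layer; ++i) {
        if (use_more_bits(i, n_layer)) {
            // prints layers 0-3, 6, 9, ..., 27, and 28-31
            std::printf("layer %d -> higher-bit type\n", i);
        }
    }
    return 0;
}
```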
@@ -2513,15 +2383,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-
-
+                     use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+            else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                     (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
             ++i_attention_wv;
         } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                     (i_feed_forward_w2
-
+                     use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
             ++i_feed_forward_w2;
         } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2630,18 +2501,47 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
+
+
 //
 // interface implementation
 //
 
-struct
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();
 
-
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
 
-
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
@@ -2667,24 +2567,16 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-            params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
 
         {
-            const size_t memory_size = ggml_nbytes(ctx->
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
             fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
@@ -2736,8 +2628,8 @@ struct llama_context * llama_init_from_file(
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr,
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2640,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }
 
+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }
 
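With `llama_model` now split from `llama_context`, weights can be loaded once and shared by several contexts; `llama_init_from_file` keeps working by marking the context as `model_owner` so `llama_free` also releases the model. A hedged usage sketch of the new API (the model path is a placeholder):

```cpp
// Sketch of the new ownership model. With the split API the caller frees the
// model explicitly; with llama_init_from_file the context owns the model
// (model_owner = true) and llama_free releases both.
#include "llama.h"

int main() {
    llama_init_backend(false);

    llama_context_params params = llama_context_default_params();

    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (!model) {
        return 1;
    }

    // several contexts can share the same loaded weights
    llama_context * ctx = llama_new_context_with_model(model, params);

    // ... llama_eval(ctx, ...), sampling, etc. ...

    llama_free(ctx);         // frees only the context
    llama_free_model(model); // frees the weights
    return 0;
}
```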
@@ -2765,11 +2673,9 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file_internal(struct
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2817,7 +2723,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }
 
@@ -2828,7 +2734,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2846,7 +2752,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -2907,7 +2813,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 return false;
             }
         }
-        ggml_tensor* lora_tensor;
+        ggml_tensor * lora_tensor;
         if (n_dims == 2) {
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
@@ -2915,6 +2821,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
+        ggml_set_name(lora_tensor, "lora_tensor");
 
         // load tensor data
         size_t offset = fin.tellg();
@@ -2930,6 +2837,21 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -2957,7 +2879,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }
 
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -2967,19 +2894,32 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }
 
             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
@@ -3012,7 +2952,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
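Because the LoRA code now operates on a `llama_model` rather than a `llama_context`, an adapter can be applied once to the shared weights before any context exists, via the new `llama_model_apply_lora_from_file`. A hedged sketch; the paths and thread count are placeholders:

```cpp
// Hedged sketch: applying a LoRA adapter directly to a loaded model.
#include "llama.h"

int apply_adapter(llama_model * model) {
    const char * lora_path = "lora/adapter.bin"; // placeholder path
    const char * base_path = nullptr;            // no separate base model: adapt the loaded weights directly
    const int    n_threads = 4;

    if (llama_model_apply_lora_from_file(model, lora_path, base_path, n_threads) != 0) {
        return 1; // the call prints its own error message
    }
    return 0;
}
```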
@@ -3020,13 +2969,13 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->
+    return ctx->kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
-void llama_set_rng_seed(struct llama_context * ctx,
-    if (seed
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3045,7 +2994,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size = sizeof(size_t);
     const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = ctx->
+    const size_t s_kv = ctx->kv_self.buf.size;
 
     const size_t s_total = (
         + s_rng_size
@@ -3111,7 +3060,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
     // copy kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3215,7 +3164,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3259,7 +3208,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_free(cpy_ctx);
        }
 
-        ctx->
+        ctx->kv_self.n = kv_ntok;
    }
 
    const size_t nread = inp - src;
@@ -3355,7 +3304,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
|
|
3376
3347
|
|
3377
3348
|
const std::vector<llama_token> tmp(n_batch, llama_token_bos());
|
3378
3349
|
|
3379
|
-
if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
|
3350
|
+
if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
|
3380
3351
|
fprintf(stderr, "%s: failed to eval\n", __func__);
|
3381
3352
|
return 1;
|
3382
3353
|
}
|
@@ -3506,6 +3477,6 @@ const char * llama_print_system_info(void) {
|
|
3506
3477
|
}
|
3507
3478
|
|
3508
3479
|
// For internal test use
|
3509
|
-
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
3480
|
+
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
3510
3481
|
return ctx->model.tensors_by_name;
|
3511
3482
|
}
|