llama_cpp 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +305 -133
- data/ext/llama_cpp/src/ggml-cuda.cu +367 -69
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +262 -291
- data/ext/llama_cpp/src/llama.h +49 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +14 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
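The headline change in the bundled llama.cpp sources (see the llama.cpp diff below) is that the model is split out of the context: a llama_model is now loaded once with llama_load_model_from_file, one or more contexts are created on top of it with llama_new_context_with_model, and llama_init_backend gains a NUMA flag. A minimal C++ sketch of the new flow, assuming a local GGML model file (the path below is only a placeholder):

#include "llama.h"
#include <cstdio>

int main() {
    // new in this release: llama_init_backend() takes a NUMA flag
    llama_init_backend(/*numa =*/ false);

    llama_context_params params = llama_context_default_params();

    // the model is loaded once and owned separately from the context
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (!model) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // one or more contexts can share the same model
    llama_context * ctx = llama_new_context_with_model(model, params);
    if (!ctx) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize, llama_eval(), sample ...

    llama_free(ctx);         // frees only the context; model_owner is false here
    llama_free_model(model); // the model is released explicitly
    return 0;
}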
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -21,9 +21,13 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
 #define QK_K 256
 #endif
 #endif
+#endif

 #include <array>
 #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
     }
 };

+struct llama_vocab {
+    using id = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;

@@ -198,10 +215,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;

-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;

@@ -215,6 +228,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
     }
 };

-struct llama_vocab {
-    using id = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;

-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;

     int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
     int32_t n_eval = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;

     size_t mem_per_token = 0;

@@ -351,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }

-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };

 struct llama_load_tensors_map {
@@ -463,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;

-    llama_file_loader(const char * fname,
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -526,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-
-
-            file.read_raw(
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -553,30 +484,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n",
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
            }

-
-
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);

-
-
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);

-
-
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
@@ -646,56 +567,19 @@ struct llama_file_saver {
 };

 struct llama_model_loader {
-    std::
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;

-    llama_model_loader(const std::string & fname_base, bool use_mmap
-
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
            use_mmap = false;
        }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }

     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -761,7 +645,7 @@ struct llama_model_loader {
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -817,45 +701,13 @@ struct llama_model_loader {

     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-
-
-
-
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
@@ -925,7 +777,7 @@ static bool kv_cache_init(

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -964,7 +816,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();

     // needed to initialize f16 tables
@@ -973,6 +825,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }

 int64_t llama_time_us() {
@@ -1033,7 +889,8 @@ static const char *llama_model_type_name(e_model type) {

 static void llama_model_load_internal(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1047,15 +904,14 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {

-
+    model.t_start_us = ggml_time_us();

-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

-
-
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;

     {
@@ -1089,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

@@ -1122,15 +977,15 @@ static void llama_model_load_internal(

     // create the ggml context
     {
-
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-
-
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }

         struct ggml_init_params params = {
-            /*.mem_size =*/
-            /*.mem_buffer =*/
+            /*.mem_size =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc =*/ ml->use_mmap,
         };

@@ -1311,7 +1166,7 @@ static void llama_model_load_internal(
     }
 #endif

-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);

     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1176,13 @@ static void llama_model_load_internal(

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }

 static bool llama_model_load(
         const std::string & fname,
-
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1340,7 +1196,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1351,22 +1207,26 @@ static bool llama_model_load(

 // evaluate the transformer
 //
-// - lctx:
-// - tokens:
-// -
-// -
-// -
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-
-
-
-
-
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
         const char * cgraph_fname) {

+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1378,7 +1238,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;

-    const auto & kv_self =
+    const auto & kv_self = lctx.kv_self;

     LLAMA_ASSERT(!!kv_self.ctx);

@@ -1406,12 +1266,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }

     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
@@ -1473,11 +1339,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");

-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");

-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");

@@ -1726,7 +1592,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

     // update kv token count
-    lctx.
+    lctx.kv_self.n = n_past + N;

     // extract logits
     {
@@ -2005,9 +1871,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;

-        // Check if the running sum is
-
-
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
@@ -2432,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false
-
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2459,6 +2325,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;

+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -2513,15 +2383,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-
-
+                    use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+            else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
             ++i_attention_wv;
         } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    (i_feed_forward_w2
-
+                    use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
             ++i_feed_forward_w2;
         } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2630,18 +2501,47 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }

+
+
 //
 // interface implementation
 //

-struct
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();

-
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}

-
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

@@ -2667,24 +2567,16 @@ struct llama_context * llama_init_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-            params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }

         {
-            const size_t memory_size = ggml_nbytes(ctx->
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
             fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }

@@ -2736,8 +2628,8 @@ struct llama_context * llama_init_from_file(

         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr,
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));

         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2640,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }

+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }

@@ -2765,11 +2673,9 @@ int llama_model_quantize(
     }
 }

-int llama_apply_lora_from_file_internal(struct
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();

     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2817,7 +2723,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }

@@ -2828,7 +2734,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));

         size_t ctx_size;
         size_t mmapped_size;
@@ -2846,7 +2752,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }

@@ -2907,7 +2813,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 return false;
             }
         }
-        ggml_tensor* lora_tensor;
+        ggml_tensor * lora_tensor;
         if (n_dims == 2) {
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
@@ -2915,6 +2821,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
+        ggml_set_name(lora_tensor, "lora_tensor");

         // load tensor data
         size_t offset = fin.tellg();
@@ -2930,6 +2837,21 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {

             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -2957,7 +2879,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }

             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");

             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -2967,19 +2894,32 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");

             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }

             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }

             struct ggml_cgraph gf = ggml_build_forward(r);
@@ -3012,7 +2952,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
@@ -3020,13 +2969,13 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->
+    return ctx->kv_self.n;
 }

 #define LLAMA_MAX_RNG_STATE (64*1024)

-void llama_set_rng_seed(struct llama_context * ctx,
-    if (seed
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3045,7 +2994,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size = sizeof(size_t);
     const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = ctx->
+    const size_t s_kv = ctx->kv_self.buf.size;

     const size_t s_total = (
         + s_rng_size
@@ -3111,7 +3060,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

     // copy kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3215,7 +3164,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

     // set kv cache
     {
-        const auto & kv_self = ctx->
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3259,7 +3208,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }

-        ctx->
+        ctx->kv_self.n = kv_ntok;
     }

     const size_t nread = inp - src;
@@ -3355,7 +3304,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3376,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {

     const std::vector<llama_token> tmp(n_batch, llama_token_bos());

-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3506,6 +3477,6 @@ const char * llama_print_system_info(void) {
 }

 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }