llama_cpp 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +74 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +48 -17
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/llama.cpp +127 -222
- data/ext/llama_cpp/src/llama.h +16 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
@@ -476,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (tensor.type) {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
             case GGML_TYPE_Q6_K:
                 break;
             default: {
-                throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
             }
             }
 
-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
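A note on the alignment seek kept above: `file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR)` advances the read cursor to the next 32-byte boundary. A minimal standalone sketch of that arithmetic (illustrative, not part of the gem):

    #include <cassert>
    #include <cstddef>

    int main() {
        // (-off) & 31 yields the padding needed to reach the next multiple of 32
        // (0 when already aligned); this relies on two's-complement negation.
        for (std::ptrdiff_t off : {0, 1, 31, 32, 33, 100}) {
            std::ptrdiff_t pad = -off & 31;
            assert(pad >= 0 && pad < 32);
            assert((off + pad) % 32 == 0);
        }
        return 0;
    }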
@@ -659,56 +567,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +645,7 @@ struct llama_model_loader {
     }
 
     if (use_mmap) {
-        mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+        mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
         if (lmlock) {
             lmlock->init(mapping->addr);
         }
@@ -830,45 +701,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
@@ -938,7 +777,7 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/ -1,
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
@@ -1369,22 +1207,26 @@ static bool llama_model_load(
 
 // evaluate the transformer
 //
-//   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - n_tokens:  number of tokens
-//   - n_past:    the context size so far
-//   - n_threads: number of threads to use
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - embd       embeddings input
+//   - n_tokens   number of tokens
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-        llama_context & lctx,
-        const llama_token * tokens,
-        const int n_tokens,
-        const int n_past,
-        const int n_threads,
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
         const char * cgraph_fname) {
 
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1424,12 +1266,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
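The new `LLAMA_ASSERT((!tokens && embd) || (tokens && !embd))` makes the two inputs mutually exclusive: token ids go through the `tok_embeddings` row lookup, while a float matrix is copied straight into `inpL`. A hedged caller-side sketch of the second path through the public API added below (the helper name and sizes are illustrative, not part of the diff):

    #include <vector>
    #include "llama.h"

    // Feed pre-computed embeddings instead of token ids; each of the n_tokens
    // rows must be n_embd floats wide, matching the loaded model.
    int eval_with_embeddings(llama_context * ctx, int n_embd, int n_past) {
        const int n_tokens = 4;
        std::vector<float> embd(n_tokens * n_embd, 0.0f);
        return llama_eval_embd(ctx, embd.data(), n_tokens, n_past, /*n_threads*/ 4);
    }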
@@ -2451,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2654,6 +2501,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+
+
 //
 // interface implementation
 //
@@ -2692,7 +2541,7 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model, model->vocab);
 
-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
@@ -2874,7 +2723,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }
 
@@ -2885,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2903,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -2964,7 +2813,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             return false;
         }
     }
-    ggml_tensor* lora_tensor;
+    ggml_tensor * lora_tensor;
     if (n_dims == 2) {
         lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
     }
@@ -2972,6 +2821,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
         return 1;
     }
+    ggml_set_name(lora_tensor, "lora_tensor");
 
     // load tensor data
     size_t offset = fin.tellg();
@@ -2987,6 +2837,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -3014,7 +2879,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2894,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }
 
             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
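For reference, the merge this graph performs is the standard LoRA update stated in the `// w = w + BA*s` comment, with s the adapter's scaling factor:

    W <- W + s * (B A)

When `base_t == dest_t` the addition runs in place on the destination weight; otherwise the sum is formed against the base weight and copied back with `ggml_cpy`. Each intermediate node is now named and routed through `offload_func`, so the whole update can run on the GPU when the destination tensor lives there.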
@@ -3091,8 +2974,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
-void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed < 0) {
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3421,7 +3304,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -46,6 +46,8 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int seed;         // RNG seed, -1 for random
-        int n_ctx;        // text context
-        int n_batch;      // prompt processing batch size
-        int n_gpu_layers; // number of layers to store in VRAM
-        int main_gpu;     // the GPU that is used for scratch and small tensors
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
@@ -196,7 +198,7 @@ extern "C" {
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
@@ -226,6 +228,14 @@ extern "C" {
         int n_past,
         int n_threads);
 
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads);
+
     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     // parameters here to keep things simple
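With this header change `seed` becomes unsigned, and `LLAMA_DEFAULT_SEED` (0xFFFFFFFF) replaces the old `seed < 0` convention for requesting a time-based seed. A minimal sketch of the new contract (assumes the bundled header; error handling omitted):

    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        // params.seed starts at LLAMA_DEFAULT_SEED, so context creation will
        // substitute time(NULL); set any other value for reproducible runs.
        params.seed = 12345;
        return 0;
    }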
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.0'
+  VERSION = '0.3.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-b8c8dda'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -4,6 +4,7 @@ module LLaMACpp
   LLAMA_FILE_VERSION: String
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
+  LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
 
@@ -72,6 +73,7 @@ module LLaMACpp
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-07-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: