llama_cpp 0.3.0 → 0.3.1
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +74 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +48 -17
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/llama.cpp +127 -222
- data/ext/llama_cpp/src/llama.h +16 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
@@ -476,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname,
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-
-
-            file.read_raw(
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n",
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
             }
 
-
-
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-
-
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-
-
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
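In the rewritten read_tensor_metadata above, each tensor record in the single model file is parsed as: n_dims, name_len, the ggml type, the n_dims shape values, the name, then padding up to the next 32-byte boundary, at which point the data offset and size are recorded and the reader seeks past the data. The padding step is the (-pos) & 31 idiom from the diff. A small self-contained sketch of just that alignment arithmetic (the sample offsets are made up):

#include <cstdint>
#include <cstdio>

// Mirrors file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR):
// (-pos) & 31 is 0 when pos is already a multiple of 32, otherwise 32 - pos % 32.
static int64_t align_up_32(int64_t pos) {
    return pos + ((-pos) & 31);
}

int main() {
    const int64_t sample_offsets[] = {0, 1, 31, 32, 33, 1000}; // made-up values
    for (int64_t pos : sample_offsets) {
        std::printf("%lld -> %lld\n", (long long) pos, (long long) align_up_32(pos));
    }
    return 0;
}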
@@ -659,56 +567,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap
-
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +645,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -830,45 +701,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-
-
-
-
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
@@ -938,7 +777,7 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->
-    model.hparams = ml->
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
@@ -1369,22 +1207,26 @@ static bool llama_model_load(
 
 // evaluate the transformer
 //
-// - lctx:
-// - tokens:
-// -
-// -
-// -
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-
-
-
-
-
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
         const char * cgraph_fname) {
 
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1424,12 +1266,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
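With the new branch above, llama_eval_internal can be driven either by token ids (looked up in tok_embeddings via ggml_get_rows) or by a caller-supplied float buffer that is copied verbatim into an [n_embd, N] F32 tensor. The buffer is therefore expected to hold N consecutive rows of n_embd floats, token 0 first. A hedged sketch of packing such a buffer; the per-token vectors here stand in for whatever the caller actually produces:

#include <cstddef>
#include <vector>

// Flatten per-token embedding vectors into the row-major [n_tokens x n_embd]
// float buffer that the embd code path above memcpy's into its input tensor.
std::vector<float> pack_embeddings(const std::vector<std::vector<float>> & per_token,
                                   std::size_t n_embd) {
    std::vector<float> flat;
    flat.reserve(per_token.size() * n_embd);
    for (const auto & v : per_token) {
        // each per-token vector is assumed to hold exactly n_embd values
        flat.insert(flat.end(), v.begin(), v.begin() + static_cast<std::ptrdiff_t>(n_embd));
    }
    return flat; // pass flat.data() as embd and per_token.size() as n_tokens
}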
@@ -2451,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false
-
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2654,6 +2501,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+
+
 //
 // interface implementation
 //
@@ -2692,7 +2541,7 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model, model->vocab);
 
-    if (params.seed
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
@@ -2874,7 +2723,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }
 
@@ -2885,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2903,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -2964,7 +2813,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             return false;
         }
     }
-    ggml_tensor* lora_tensor;
+    ggml_tensor * lora_tensor;
     if (n_dims == 2) {
         lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
     }
@@ -2972,6 +2821,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
         return 1;
     }
+    ggml_set_name(lora_tensor, "lora_tensor");
 
     // load tensor data
     size_t offset = fin.tellg();
@@ -2987,6 +2837,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+            offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                if (dest_t->type != GGML_TYPE_F16) {
+                    throw std::runtime_error(format(
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+                }
+                offload_func = ggml_cuda_assign_buffers;
+                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
@@ -3014,7 +2879,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2894,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }
 
             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                offload_func_force_inplace(r);
+                ggml_set_name(r, "r_add_inplace");
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
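The `w = w + BA*s` comment above is the whole LoRA update: the low-rank product of the loraB and loraA tensors, scaled by s, is added to the base weight (in place when the destination already holds the base weight, otherwise via ggml_add plus a copy back into dest_t). As a shape check only, here is a toy dense version of that arithmetic in plain C++; the dimensions are illustrative, and ggml's own tensor layout and mul_mat conventions differ from this naive loop:

#include <vector>

// Toy dense LoRA update: W (m x n) += scaling * B (m x r) * A (r x n).
// Row-major storage; purely illustrative, not the ggml graph built above.
void apply_lora(std::vector<float> & W, const std::vector<float> & A,
                const std::vector<float> & B, int m, int n, int r, float scaling) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            float delta = 0.0f;
            for (int k = 0; k < r; ++k) {
                delta += B[i * r + k] * A[k * n + j];
            }
            W[i * n + j] += scaling * delta;
        }
    }
}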
@@ -3091,8 +2974,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
-void llama_set_rng_seed(struct llama_context * ctx,
-    if (seed
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3421,7 +3304,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -46,6 +46,8 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-
-
-
-
-
+        uint32_t seed; // RNG seed, -1 for random
+        int32_t n_ctx; // text context
+        int32_t n_batch; // prompt processing batch size
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu; // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
@@ -196,7 +198,7 @@ extern "C" {
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx,
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
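Together with the LLAMA_DEFAULT_SEED define added earlier in this header and the llama.cpp changes above, seeding now works the same way in both places: passing 0xFFFFFFFF (that is, -1 as an unsigned 32-bit value) asks the library to derive a seed from time(NULL), while any other value is used as-is. A minimal hedged usage sketch, assuming ctx is a llama_context created elsewhere:

#include <cstdint>
#include "llama.h"

// Reseed the context's RNG. Passing LLAMA_DEFAULT_SEED requests a
// time(NULL)-based seed; any other value makes sampling reproducible.
void reseed(llama_context * ctx, uint32_t seed) {
    llama_set_rng_seed(ctx, seed);
}

void reseed_randomly(llama_context * ctx) {
    reseed(ctx, LLAMA_DEFAULT_SEED);
}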
@@ -226,6 +228,14 @@ extern "C" {
             int n_past,
             int n_threads);
 
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
+            struct llama_context * ctx,
+            const float * embd,
+            int n_tokens,
+            int n_past,
+            int n_threads);
+
     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     // parameters here to keep things simple
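The new llama_eval_embd declaration mirrors llama_eval but takes a float buffer of n_tokens * n_embd values instead of token ids; internally exactly one of the two inputs may be non-null. A hedged usage sketch, assuming ctx is an already-initialized llama_context, that llama_n_embd() is the header's existing accessor for the embedding width, and using zero-filled placeholder embeddings:

#include <cstddef>
#include <vector>
#include "llama.h"

// Evaluate raw embeddings instead of token ids. ctx is assumed to be a valid
// llama_context; the zero-filled buffer is a placeholder for real embeddings.
int eval_embeddings(llama_context * ctx, int n_tokens, int n_past, int n_threads) {
    const int n_embd = llama_n_embd(ctx);

    // n_tokens rows of n_embd floats, token 0 first
    std::vector<float> embd(static_cast<std::size_t>(n_tokens) * n_embd, 0.0f);

    // returns 0 on success, like llama_eval()
    return llama_eval_embd(ctx, embd.data(), n_tokens, n_past, n_threads);
}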
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.0'
+  VERSION = '0.3.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-b8c8dda'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -4,6 +4,7 @@ module LLaMACpp
   LLAMA_FILE_VERSION: String
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
+  LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
 
@@ -72,6 +73,7 @@ module LLaMACpp
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-07-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: