llama_cpp 0.3.0 → 0.3.1

@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
  return size / ggml_blck_size(type);
  }
 
- struct llama_load_tensor_shard {
- std::vector<uint32_t> ne;
- size_t size;
- enum ggml_type type;
- size_t file_idx;
- size_t file_off;
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
- };
-
- enum llama_split_type {
- SPLIT_NONE,
- SPLIT_BY_COLUMNS,
- SPLIT_BY_ROWS
- };
-
  struct llama_load_tensor {
- std::vector<llama_load_tensor_shard> shards;
-
  std::string name;
  enum ggml_type type = GGML_TYPE_F32;
- llama_split_type split_type = SPLIT_NONE;
  std::vector<uint32_t> ne;
+ size_t file_off;
  size_t size;
  struct ggml_tensor * ggml_tensor = NULL;
  uint8_t * data;
-
- llama_load_tensor(const std::string & name) : name(name) {}
-
- void calc_all() {
- calc_type();
- calc_split_type();
- calc_ne();
- calc_size();
- }
-
- void calc_type() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.type != first_shard.type) {
- throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
- }
- }
- type = first_shard.type;
- }
-
- void calc_split_type() {
- if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
- shards.size() == 1) { // only one file?
- split_type = SPLIT_NONE;
- } else if (name.find("tok_embeddings.") == 0 ||
- name.find(".attention.wo.weight") != std::string::npos ||
- name.find(".feed_forward.w2.weight") != std::string::npos) {
- split_type = SPLIT_BY_COLUMNS;
- } else {
- split_type = SPLIT_BY_ROWS;
- }
- }
-
- void calc_ne() {
- const auto & first_shard = shards.at(0);
- for (const auto & shard : shards) {
- if (shard.ne != first_shard.ne) {
- throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
- }
- }
- ne = first_shard.ne;
- LLAMA_ASSERT(shards.size() <= UINT32_MAX);
- uint32_t n_shards = (uint32_t) shards.size();
- switch (split_type) {
- case SPLIT_NONE:
- ne = first_shard.ne;
- break;
- case SPLIT_BY_COLUMNS:
- ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
- first_shard.ne[1]};
- break;
- case SPLIT_BY_ROWS:
- ne = {first_shard.ne[0],
- checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
- break;
- }
- }
-
- void calc_size() {
- size = llama_calc_tensor_size(ne, type);
- }
  };
 
  struct llama_load_tensors_map {
@@ -476,13 +394,13 @@ struct llama_file_loader {
  llama_hparams hparams;
  llama_vocab vocab;
 
- llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+ llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
  : file(fname, "rb") {
  fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
  read_magic();
  read_hparams();
  read_vocab();
- read_tensor_metadata(file_idx, tensors_map);
+ read_tensor_metadata(tensors_map);
  }
  void read_magic() {
  uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
  tok_score.score = score;
  }
  }
- void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+ void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
  while (file.tell() < file.size) {
- llama_load_tensor_shard shard;
+ llama_load_tensor tensor;
  uint32_t n_dims = file.read_u32();
  uint32_t name_len = file.read_u32();
- shard.type = (enum ggml_type) file.read_u32();
- shard.ne.resize(n_dims);
- file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+ tensor.type = (enum ggml_type) file.read_u32();
+ tensor.ne.resize(n_dims);
+ file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
  std::string name = file.read_string(name_len);
  if (n_dims < 1 || n_dims > 2) {
  throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
  }
- switch (shard.type) {
+ switch (tensor.type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
  case GGML_TYPE_Q6_K:
  break;
  default: {
- throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+ throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
  }
  }
 
- if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
- // skip to the next multiple of 32 bytes
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
- }
- shard.file_idx = file_idx;
- shard.file_off = file.tell();
+ // skip to the next multiple of 32 bytes
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
- shard.calc_size();
- file.seek(shard.size, SEEK_CUR);
+ tensor.file_off = file.tell();
+ tensor.name = name;
+ tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+ file.seek(tensor.size, SEEK_CUR);
 
- auto it = tensors_map.name_to_idx.find(name);
- size_t idx;
- if (it != tensors_map.name_to_idx.end()) {
- idx = it->second;
- } else {
- tensors_map.tensors.emplace_back(name);
- idx = tensors_map.tensors.size() - 1;
- tensors_map.name_to_idx.emplace(name, idx);
- }
- tensors_map.tensors.at(idx).shards.push_back(shard);
+ tensors_map.tensors.push_back(tensor);
+ tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
  }
  }
  };
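The metadata reader above now always seeks to the next 32-byte boundary before recording a tensor's file offset (previously this was guarded by a file-version check). A minimal sketch of the padding arithmetic it relies on; pad_to_32 is an illustrative name, not a function in llama.cpp:

    // (-pos) & 31 is the number of bytes needed to reach the next multiple of 32
    // (0 when pos is already aligned), which is exactly what the seek above adds.
    #include <cassert>
    #include <cstddef>

    static std::ptrdiff_t pad_to_32(std::ptrdiff_t pos) {  // hypothetical helper
        return -pos & 31;
    }

    int main() {
        assert(pad_to_32(0)  == 0);
        assert(pad_to_32(1)  == 31);
        assert(pad_to_32(32) == 0);
        assert(pad_to_32(45) == 19);  // 45 + 19 == 64
        return 0;
    }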
@@ -659,56 +567,19 @@ struct llama_file_saver {
  };
 
  struct llama_model_loader {
- std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+ std::unique_ptr<llama_file_loader> file_loader;
  llama_load_tensors_map tensors_map;
  bool use_mmap;
  size_t num_ggml_tensors_created = 0;
  struct ggml_context * ggml_ctx = NULL;
  std::unique_ptr<llama_mmap> mapping;
 
- llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
- auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
- file_loaders.emplace_back(first_file);
- uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
- for (uint32_t i = 1; i < n_parts; i++) {
- std::string fname = fname_base + "." + std::to_string(i);
- auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
- file_loaders.emplace_back(ith_file);
- if (ith_file->hparams != first_file->hparams) {
- throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
- }
- }
+ llama_model_loader(const std::string & fname_base, bool use_mmap) {
+ file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
  if (!llama_mmap::SUPPORTED) {
  use_mmap = false;
  }
- if (use_mmap && alignment_prevents_mmap()) {
- fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
- use_mmap = false;
- }
  this->use_mmap = use_mmap;
- for (llama_load_tensor & lt : tensors_map.tensors) {
- lt.calc_all();
- }
- }
-
- bool alignment_prevents_mmap() {
- for (const llama_load_tensor & lt : tensors_map.tensors) {
- for (const llama_load_tensor_shard & shard : lt.shards) {
- if (shard.file_off & 3) {
- return true;
- }
- }
- }
- return false;
- }
-
- uint32_t guess_n_parts() const {
- auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
- if (it == tensors_map.name_to_idx.end()) {
- throw std::runtime_error(std::string("missing tok_embeddings.weight"));
- }
- const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
- return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
  }
 
  void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +645,7 @@ struct llama_model_loader {
  }
 
  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+ mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -830,45 +701,13 @@ struct llama_model_loader {
 
  void load_data_for(llama_load_tensor & lt) {
  if (use_mmap) {
- LLAMA_ASSERT(lt.shards.size() == 1);
- lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
- } else if (lt.split_type == SPLIT_NONE) {
- llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
- file.seek(lt.shards.at(0).file_off, SEEK_SET);
+ lt.data = (uint8_t *) mapping->addr + lt.file_off;
+ } else {
+ llama_file & file = file_loader->file;
+ file.seek(lt.file_off, SEEK_SET);
  file.read_raw(lt.data, lt.size);
- } else if (lt.split_type == SPLIT_BY_ROWS) {
- size_t offset = 0;
- for (llama_load_tensor_shard & shard : lt.shards) {
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- file.read_raw(lt.data + offset, shard.size);
- offset += shard.size;
- }
- LLAMA_ASSERT(offset == lt.size);
- } else if (lt.split_type == SPLIT_BY_COLUMNS) {
- // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs(lt.shards.size());
- for (size_t i = 0; i < lt.shards.size(); i++) {
- llama_load_tensor_shard & shard = lt.shards.at(i);
- llama_file & file = file_loaders.at(shard.file_idx)->file;
- file.seek(shard.file_off, SEEK_SET);
- tmp_bufs.at(i).resize(shard.size);
- file.read_raw(tmp_bufs.at(i).addr, shard.size);
- }
- // Then reshape.
- size_t num_rows = lt.ne.at(1);
- size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
- size_t out_offset = 0;
- for (size_t row = 0; row < num_rows; row++) {
- for (llama_buffer & tmp_buf : tmp_bufs) {
- memcpy(lt.data + out_offset,
- tmp_buf.addr + row * per_shard_row_size,
- per_shard_row_size);
- out_offset += per_shard_row_size;
- }
- }
- LLAMA_ASSERT(out_offset == lt.size);
  }
+
  if (0) {
  print_checksum(lt);
  }
@@ -938,7 +777,7 @@ static bool kv_cache_init(
 
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
- /*.seed =*/ -1,
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(
 
  model.t_start_us = ggml_time_us();
 
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
- vocab = std::move(ml->file_loaders.at(0)->vocab);
- model.hparams = ml->file_loaders.at(0)->hparams;
+ vocab = std::move(ml->file_loader->vocab);
+ model.hparams = ml->file_loader->hparams;
  model.n_gpu_layers = n_gpu_layers;
- llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+ llama_file_version file_version = ml->file_loader->file_version;
  auto & hparams = model.hparams;
 
  {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }
 
@@ -1369,22 +1207,26 @@ static bool llama_model_load(
 
  // evaluate the transformer
  //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - n_past: the context size so far
- // - n_threads: number of threads to use
- // - cgraph_fname: filename of the exported computation graph
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - embd embeddings input
+ // - n_tokens number of tokens
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
  //
  static bool llama_eval_internal(
- llama_context & lctx,
- const llama_token * tokens,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ const int n_tokens,
+ const int n_past,
+ const int n_threads,
  const char * cgraph_fname) {
 
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
  // enforce that the first token is BOS
- if (n_past == 0 && tokens[0] != llama_token_bos()) {
+ if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
  fprintf(stderr, "%s: first token must be BOS\n", __func__);
  return false;
  }
@@ -1424,12 +1266,18 @@ static bool llama_eval_internal(
  ggml_cgraph gf = {};
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
  struct ggml_tensor * cur;
- struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inpL;
+
+ if (tokens) {
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
+ memcpy(embd->data, tokens, N*ggml_element_size(embd));
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ } else {
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ }
 
  const int i_gpu_start = n_layer - n_gpu_layers;
  (void) i_gpu_start;
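With the branch above, the graph input is either a row of token ids or a ready-made float matrix copied into an F32 tensor of shape n_embd x N. A hedged sketch of the buffer layout the float path expects; pack_embeddings and get_row_embedding are hypothetical helpers, not llama.cpp API:

    // One n_embd-sized row of floats per token, flattened row by row; the result's
    // data() is what the N * n_embd memcpy on the embedding branch consumes.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    std::vector<float> pack_embeddings(int n_tokens, int n_embd,
                                       const float * (*get_row_embedding)(int)) {
        std::vector<float> embd((size_t) n_tokens * (size_t) n_embd);
        for (int i = 0; i < n_tokens; ++i) {
            const float * row = get_row_embedding(i);          // one vector per position
            std::copy(row, row + n_embd, embd.data() + (size_t) i * n_embd);
        }
        return embd;
    }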
@@ -2451,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  nthread = std::thread::hardware_concurrency();
  }
 
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
- /*vocab_only*/ false));
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
  #ifdef GGML_USE_K_QUANTS
  int n_attention_wv = 0;
@@ -2654,6 +2501,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  }
 
+
+
  //
  // interface implementation
  //
@@ -2692,7 +2541,7 @@ struct llama_context * llama_new_context_with_model(
 
  llama_context * ctx = new llama_context(*model, model->vocab);
 
- if (params.seed < 0) {
+ if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }
 
@@ -2874,7 +2723,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
  // create a name -> tensor map of the model to accelerate lookups
  std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
- for (auto & kv: model.tensors_by_name) {
+ for (const auto & kv: model.tensors_by_name) {
  model_tensors.insert(kv);
  }
 
@@ -2885,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  llama_buffer base_buf;
  if (path_base_model) {
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
  size_t ctx_size;
  size_t mmapped_size;
@@ -2903,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
  }
  }
 
@@ -2964,7 +2813,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  return false;
  }
  }
- ggml_tensor* lora_tensor;
+ ggml_tensor * lora_tensor;
  if (n_dims == 2) {
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
  }
@@ -2972,6 +2821,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
  return 1;
  }
+ ggml_set_name(lora_tensor, "lora_tensor");
 
  // load tensor data
  size_t offset = fin.tellg();
@@ -2987,6 +2837,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
  ggml_tensor * dest_t = model_tensors[base_name];
+
+ offload_func_t offload_func = llama_nop;
+ offload_func_t offload_func_force_inplace = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+ if (dest_t->type != GGML_TYPE_F16) {
+ throw std::runtime_error(format(
+ "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+ }
+ offload_func = ggml_cuda_assign_buffers;
+ offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+ }
+ #endif // GGML_USE_CUBLAS
+
  ggml_tensor * base_t;
  if (model_loader) {
  // load from base model
@@ -3014,7 +2879,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }
 
  ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+ GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+ ggml_set_name(loraA, "loraA");
+
  ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+ GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+ ggml_set_name(loraB, "loraB");
 
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
  fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2894,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
  // w = w + BA*s
  ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+ offload_func(BA);
+ ggml_set_name(BA, "BA");
 
  if (scaling != 1.0f) {
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+ ggml_set_name(scale_tensor, "scale_tensor");
+
  BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+ offload_func(BA);
+ ggml_set_name(BA, "BA_scaled");
  }
 
  ggml_tensor * r;
  if (base_t == dest_t) {
  r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ offload_func_force_inplace(r);
+ ggml_set_name(r, "r_add_inplace");
  }
  else {
  r = ggml_add(lora_ctx, base_t, BA);
+ offload_func(r);
+ ggml_set_name(r, "r_add");
+
  r = ggml_cpy(lora_ctx, r, dest_t);
+ offload_func(r);
+ ggml_set_name(r, "r_cpy");
  }
 
  struct ggml_cgraph gf = ggml_build_forward(r);
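The graph built above realizes the "w = w + BA*s" comment, now with optional CUDA offloading and named nodes for debugging. For orientation only, a plain dense-float sketch of the same merge, ignoring ggml's tensor layout and quantization (all names are illustrative):

    // W is n_out x n_in, A is r x n_in, B is n_out x r, scaling is the LoRA scale s.
    void lora_merge_reference(float * W, const float * A, const float * B,
                              int n_out, int n_in, int r, float scaling) {
        for (int i = 0; i < n_out; ++i) {
            for (int j = 0; j < n_in; ++j) {
                float acc = 0.0f;
                for (int k = 0; k < r; ++k) {
                    acc += B[i * r + k] * A[k * n_in + j];  // (B * A)[i][j]
                }
                W[i * n_in + j] += scaling * acc;           // w = w + BA*s
            }
        }
    }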
@@ -3091,8 +2974,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 
  #define LLAMA_MAX_RNG_STATE (64*1024)
 
- void llama_set_rng_seed(struct llama_context * ctx, int seed) {
- if (seed < 0) {
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+ if (seed == LLAMA_DEFAULT_SEED) {
  seed = time(NULL);
  }
  ctx->rng.seed(seed);
@@ -3421,7 +3304,29 @@ int llama_eval(
  int n_tokens,
  int n_past,
  int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+ if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+ fprintf(stderr, "%s: failed to eval\n", __func__);
+ return 1;
+ }
+
+ // get a more accurate load time, upon first eval
+ // TODO: fix this
+ if (!ctx->has_evaluated_once) {
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+ ctx->has_evaluated_once = true;
+ }
+
+ return 0;
+ }
+
+
+ int llama_eval_embd(
+ struct llama_context * ctx,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads) {
+ if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
@@ -3442,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 
  const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
- if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+ if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
  fprintf(stderr, "%s: failed to eval\n", __func__);
  return 1;
  }
@@ -46,6 +46,8 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1
 
+ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);
 
  struct llama_context_params {
- int seed; // RNG seed, -1 for random
- int n_ctx; // text context
- int n_batch; // prompt processing batch size
- int n_gpu_layers; // number of layers to store in VRAM
- int main_gpu; // the GPU that is used for scratch and small tensors
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
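Because the seed is now unsigned, the old "negative means random" convention is gone; llama_new_context_with_model and llama_set_rng_seed instead compare against LLAMA_DEFAULT_SEED (0xFFFFFFFF) and fall back to a time-based seed on a match. A small caller-side sketch of that convention; resolve_seed is an illustrative helper, not part of the API:

    #include <cstdint>
    #include <ctime>

    // 0xFFFFFFFF (LLAMA_DEFAULT_SEED) asks for a time-based seed; anything else is used as-is.
    static std::uint32_t resolve_seed(std::uint32_t requested) {
        const std::uint32_t default_seed = 0xFFFFFFFFu;
        return requested == default_seed ? (std::uint32_t) std::time(nullptr) : requested;
    }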
@@ -196,7 +198,7 @@ extern "C" {
  LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
  // Sets the current rng seed.
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
  // Returns the maximum size in bytes of the state (rng, logits, embedding
  // and kv_cache) - will often be smaller after compacting tokens
@@ -226,6 +228,14 @@ extern "C" {
  int n_past,
  int n_threads);
 
+ // Same as llama_eval, but use float matrix input directly.
+ LLAMA_API int llama_eval_embd(
+ struct llama_context * ctx,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads);
+
  // Export a static computation graph for context of 511 and batch size of 1
  // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
  // parameters here to keep things simple
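A hedged usage sketch for the new entry point: it mirrors llama_eval but takes n_tokens * n_embd floats (one n_embd-sized row per position) instead of token ids. Creating the context and producing the embeddings are assumed to happen elsewhere:

    #include <vector>
    #include "llama.h"

    // embd must hold n_tokens * llama_n_embd(ctx) floats, one row per position.
    // Same return convention as llama_eval: 0 on success, non-zero on failure.
    int eval_embeddings(llama_context * ctx, const std::vector<float> & embd,
                        int n_tokens, int n_past, int n_threads) {
        return llama_eval_embd(ctx, embd.data(), n_tokens, n_past, n_threads);
    }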
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.0'
+ VERSION = '0.3.1'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-9d23589'
+ LLAMA_CPP_VERSION = 'master-b8c8dda'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -4,6 +4,7 @@ module LLaMACpp
  LLAMA_FILE_VERSION: String
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String
+ LLAMA_DEFALUT_SEED: String
 
  LLAMA_MAX_DEVICES: Integer
 
@@ -72,6 +73,7 @@ module LLaMACpp
  def initialize: (model: ::LLaMACpp::Model) -> void
  def embeddings: () -> Array[Float]
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
  def eval_export: (String) -> bool
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.0
+ version: 0.3.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-29 00:00:00.000000000 Z
+ date: 2023-07-02 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: