llama_cpp 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
-
-    llama_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
-        ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
 };
 
 struct llama_load_tensors_map {
@@ -476,13 +394,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -539,19 +457,19 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
+            llama_load_tensor tensor;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            tensor.type = (enum ggml_type) file.read_u32();
+            tensor.ne.resize(n_dims);
+            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (tensor.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -566,30 +484,20 @@ struct llama_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
                 }
             }
 
-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
+            // skip to the next multiple of 32 bytes
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
+            tensor.file_off = file.tell();
+            tensor.name = name;
+            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            file.seek(tensor.size, SEEK_CUR);
 
-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.push_back(tensor);
+            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
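The rewritten read_tensor_metadata above always aligns tensor data to 32 bytes before recording file_off. As a side note, the expression `-file.tell() & 31` is the usual bit trick for "padding needed to reach the next multiple of 32". A minimal standalone sketch of that arithmetic (the helper name align_to_32 is illustrative, not part of llama.cpp):

    #include <cstddef>
    #include <cstdio>

    // Padding needed to advance `offset` to the next multiple of 32 bytes.
    // For non-negative offsets, -offset & 31 equals (32 - offset % 32) % 32.
    static std::ptrdiff_t align_to_32(std::ptrdiff_t offset) {
        return -offset & 31;
    }

    int main() {
        std::printf("%td\n", align_to_32(100)); // prints 28: 100 + 28 = 128
        std::printf("%td\n", align_to_32(128)); // prints 0: already aligned
        return 0;
    }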
@@ -659,56 +567,19 @@ struct llama_file_saver {
 };
 
 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+    llama_model_loader(const std::string & fname_base, bool use_mmap) {
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
         this->use_mmap = use_mmap;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::string("missing tok_embeddings.weight"));
-        }
-        const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -774,7 +645,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -830,45 +701,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            llama_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
@@ -938,7 +777,7 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/ -1,
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -1067,12 +906,12 @@ static void llama_model_load_internal(
 
     model.t_start_us = ggml_time_us();
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
 
     {
@@ -1106,7 +945,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
@@ -1369,22 +1207,26 @@ static bool llama_model_load(
 
 // evaluate the transformer
 //
-//   - lctx: llama context
-//   - tokens: new batch of tokens to process
-//   - n_past: the context size so far
-//   - n_threads: number of threads to use
-//   - cgraph_fname: filename of the exported computation graph
+//   - lctx: llama context
+//   - tokens: new batch of tokens to process
+//   - embd embeddings input
+//   - n_tokens number of tokens
+//   - n_past: the context size so far
+//   - n_threads: number of threads to use
 //
 static bool llama_eval_internal(
-        llama_context & lctx,
-        const llama_token * tokens,
-        const int n_tokens,
-        const int n_past,
-        const int n_threads,
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        const int n_tokens,
+        const int n_past,
+        const int n_threads,
         const char * cgraph_fname) {
 
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
     // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+    if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
         fprintf(stderr, "%s: first token must be BOS\n", __func__);
         return false;
     }
@@ -1424,12 +1266,18 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
     struct ggml_tensor * cur;
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_set_name(embd, "embd");
+        memcpy(embd->data, tokens, N*ggml_element_size(embd));
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    } else {
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+    }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
@@ -2451,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2654,6 +2501,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
+
+
 //
 // interface implementation
 //
@@ -2692,7 +2541,7 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model, model->vocab);
 
-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
     }
 
@@ -2874,7 +2723,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
+    for (const auto & kv: model.tensors_by_name) {
         model_tensors.insert(kv);
     }
 
@@ -2885,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     llama_buffer base_buf;
     if (path_base_model) {
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -2903,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -2964,7 +2813,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
                 return false;
             }
         }
-        ggml_tensor* lora_tensor;
+        ggml_tensor * lora_tensor;
         if (n_dims == 2) {
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
@@ -2972,6 +2821,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
+        ggml_set_name(lora_tensor, "lora_tensor");
 
        // load tensor data
        size_t offset = fin.tellg();
@@ -2987,6 +2837,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
            ggml_tensor * dest_t = model_tensors[base_name];
+
+           offload_func_t offload_func = llama_nop;
+           offload_func_t offload_func_force_inplace = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+           if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+               if (dest_t->type != GGML_TYPE_F16) {
+                   throw std::runtime_error(format(
+                       "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
+               }
+               offload_func = ggml_cuda_assign_buffers;
+               offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
+           }
+#endif // GGML_USE_CUBLAS
+
            ggml_tensor * base_t;
            if (model_loader) {
                // load from base model
@@ -3014,7 +2879,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
            }
 
            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+           GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+           ggml_set_name(loraA, "loraA");
+
            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+           GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+           ggml_set_name(loraB, "loraB");
 
            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3024,19 +2894,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
            // w = w + BA*s
            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+           offload_func(BA);
+           ggml_set_name(BA, "BA");
 
            if (scaling != 1.0f) {
                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+               ggml_set_name(scale_tensor, "scale_tensor");
+
                BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+               offload_func(BA);
+               ggml_set_name(BA, "BA_scaled");
            }
 
            ggml_tensor * r;
            if (base_t == dest_t) {
                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+               offload_func_force_inplace(r);
+               ggml_set_name(r, "r_add_inplace");
            }
            else {
                r = ggml_add(lora_ctx, base_t, BA);
+               offload_func(r);
+               ggml_set_name(r, "r_add");
+
                r = ggml_cpy(lora_ctx, r, dest_t);
+               offload_func(r);
+               ggml_set_name(r, "r_cpy");
            }
 
            struct ggml_cgraph gf = ggml_build_forward(r);
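For reference, the `w = w + BA*s` comment in the hunk above is the standard LoRA merge: the dense weight matrix receives a scaled low-rank update. A minimal, illustrative sketch of that update on plain row-major arrays (not the ggml graph code used by llama.cpp; the dimensions n_out, n_in and rank r are assumptions for the example):

    #include <vector>

    // W : n_out x n_in  (dense weight, updated in place)
    // B : n_out x r     (LoRA "B" matrix)
    // A : r x n_in      (LoRA "A" matrix)
    // s : scaling factor (typically alpha / r)
    void lora_merge(std::vector<float> & W,
                    const std::vector<float> & B,
                    const std::vector<float> & A,
                    int n_out, int n_in, int r, float s) {
        for (int i = 0; i < n_out; ++i) {
            for (int j = 0; j < n_in; ++j) {
                float acc = 0.0f;
                for (int k = 0; k < r; ++k) {
                    acc += B[i * r + k] * A[k * n_in + j]; // (B·A)[i][j]
                }
                W[i * n_in + j] += s * acc; // w = w + s * (B·A)
            }
        }
    }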
@@ -3091,8 +2974,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
-void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed < 0) {
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
@@ -3421,7 +3304,29 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_eval_embd(
+        struct llama_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3442,7 +3347,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
@@ -46,6 +46,8 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int seed;         // RNG seed, -1 for random
-        int n_ctx;        // text context
-        int n_batch;      // prompt processing batch size
-        int n_gpu_layers; // number of layers to store in VRAM
-        int main_gpu;     // the GPU that is used for scratch and small tensors
+        uint32_t seed;        // RNG seed, -1 for random
+        int32_t n_ctx;        // text context
+        int32_t n_batch;      // prompt processing batch size
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
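With the seed field now unsigned and LLAMA_DEFAULT_SEED (0xFFFFFFFF) defined, passing the default asks llama.cpp to seed from time(NULL) at context creation, while any other value gives reproducible sampling. A minimal sketch of filling the params, assuming the usual llama.h entry points from this release (the helper name make_params is illustrative):

    #include "llama.h"

    // Sketch: choosing the RNG seed with the 0.3.1 API (uint32_t seed).
    struct llama_context_params make_params(uint32_t seed) {
        struct llama_context_params params = llama_context_default_params();
        params.seed    = seed; // LLAMA_DEFAULT_SEED -> time-based, e.g. 42u -> reproducible
        params.n_ctx   = 512;
        params.n_batch = 512;
        return params;
    }

    // An existing context can also be reseeded later with
    // llama_set_rng_seed(ctx, seed), which now takes a uint32_t as well.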
@@ -196,7 +198,7 @@ extern "C" {
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
@@ -226,6 +228,14 @@ extern "C" {
             int n_past,
             int n_threads);
 
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
+            struct llama_context * ctx,
+            const float * embd,
+            int n_tokens,
+            int n_past,
+            int n_threads);
+
     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     // parameters here to keep things simple
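A minimal sketch of calling the new llama_eval_embd entry point declared above, assuming a context has already been created and that the float buffer holds n_tokens rows of the model's embedding size; how those embeddings are produced (for example by an external encoder) is up to the caller and is only assumed here:

    #include <vector>
    #include "llama.h"

    // Feed raw embeddings instead of token ids.
    // ctx is an already-initialized llama_context; `embeddings` must hold
    // n_tokens * n_embd floats, where n_embd matches the model (see llama_n_embd).
    int eval_raw_embeddings(struct llama_context * ctx,
                            const std::vector<float> & embeddings,
                            int n_tokens, int n_past, int n_threads) {
        // Returns 0 on success, just like llama_eval.
        return llama_eval_embd(ctx, embeddings.data(), n_tokens, n_past, n_threads);
    }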
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.0'
+  VERSION = '0.3.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-9d23589'
+  LLAMA_CPP_VERSION = 'master-b8c8dda'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -4,6 +4,7 @@ module LLaMACpp
   LLAMA_FILE_VERSION: String
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
+  LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
 
@@ -72,6 +73,7 @@ module LLaMACpp
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-29 00:00:00.000000000 Z
+date: 2023-07-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: