llama_cpp 0.0.4 → 0.0.6

@@ -204,6 +204,9 @@ enum ggml_type {
  GGML_TYPE_F16 = 1,
  GGML_TYPE_Q4_0 = 2,
  GGML_TYPE_Q4_1 = 3,
+ GGML_TYPE_Q4_2 = 4,
+ GGML_TYPE_Q4_3 = 5,
+ GGML_TYPE_Q8_0 = 6,
  GGML_TYPE_I8,
  GGML_TYPE_I16,
  GGML_TYPE_I32,
@@ -358,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
 
  size_t ggml_element_size(const struct ggml_tensor * tensor);
 
+ bool ggml_is_quantized(enum ggml_type type);
+
  struct ggml_context * ggml_init(struct ggml_init_params params);
  void ggml_free(struct ggml_context * ctx);
 
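A minimal usage sketch for the new ggml_is_quantized helper (assuming a valid ggml_tensor * t from an existing context; ggml_type_name is already part of the API shown above):

    if (ggml_is_quantized(t->type)) {
        fprintf(stderr, "tensor uses quantized type %s\n", ggml_type_name(t->type));
    }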
@@ -429,6 +434,12 @@ struct ggml_tensor * ggml_add(
          struct ggml_tensor * a,
          struct ggml_tensor * b);
 
+
+ struct ggml_tensor * ggml_add_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a,
+         struct ggml_tensor * b);
+
  struct ggml_tensor * ggml_sub(
          struct ggml_context * ctx,
          struct ggml_tensor * a,
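A one-line sketch of the new in-place variant (ctx, a and b are assumed to come from an existing ggml graph):

    // accumulate b into a without allocating a result tensor; acc is a view of a
    struct ggml_tensor * acc = ggml_add_inplace(ctx, a, b);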
@@ -619,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(
 
  // rotary position embedding
  // in-place, returns view(a)
- // if mode == 1, skip n_past elements
+ // if mode & 1 == 1, skip n_past elements
+ // if mode & 2 == 1, GPT-NeoX style
  // TODO: avoid creating a new tensor every time
  struct ggml_tensor * ggml_rope(
          struct ggml_context * ctx,
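The mode argument is now a bit field, so the two behaviours compose. A sketch, assuming the existing ggml_rope(ctx, a, n_past, n_dims, mode) signature; ctx0, inpL, n_past and n_rot stand in for values from an existing graph:

    // bit 0: skip n_past elements, bit 1: GPT-NeoX style rotation
    const int rope_mode = 1 | 2;
    struct ggml_tensor * cur = ggml_rope(ctx0, inpL, n_past, n_rot, rope_mode);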
@@ -799,6 +811,10 @@ enum ggml_opt_result ggml_opt(
 
  size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
  size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
  //
  // system info
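A small usage sketch for the generic entry point (src, dst and n are assumed to be a float buffer, a large-enough output buffer, and the element count):

    // quantize n floats starting at element `start` in one call,
    // instead of picking a per-type ggml_quantize_q4_* function
    std::vector<int64_t> hist(1 << 4, 0);   // 16-bucket histogram, as used by llama.cpp
    size_t n_bytes = ggml_quantize_chunk(GGML_TYPE_Q4_2, src, dst, /*start=*/0, /*n=*/n, hist.data());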
@@ -807,6 +823,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
  int ggml_cpu_has_avx(void);
  int ggml_cpu_has_avx2(void);
  int ggml_cpu_has_avx512(void);
+ int ggml_cpu_has_avx512_vbmi(void);
+ int ggml_cpu_has_avx512_vnni(void);
  int ggml_cpu_has_fma(void);
  int ggml_cpu_has_neon(void);
  int ggml_cpu_has_arm_fma(void);
@@ -814,6 +832,7 @@ int ggml_cpu_has_f16c(void);
  int ggml_cpu_has_fp16_va(void);
  int ggml_cpu_has_wasm_simd(void);
  int ggml_cpu_has_blas(void);
+ int ggml_cpu_has_cublas(void);
  int ggml_cpu_has_sse3(void);
  int ggml_cpu_has_vsx(void);
 
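The new probes are plain int-returning functions like the existing ones, so they can be checked or logged directly (llama_print_system_info(), updated in the last hunk of this diff, folds most of them into its report). A one-line sketch:

    fprintf(stderr, "cuBLAS: %d, AVX512_VBMI: %d, AVX512_VNNI: %d\n",
            ggml_cpu_has_cublas(), ggml_cpu_has_avx512_vbmi(), ggml_cpu_has_avx512_vnni());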
@@ -836,6 +855,7 @@ typedef struct {
  dequantize_row_q_t dequantize_row_q;
  quantize_row_q_t quantize_row_q;
  quantize_row_q_t quantize_row_q_reference;
+ quantize_row_q_t quantize_row_q_dot;
  vec_dot_q_t vec_dot_q;
  } quantize_fns_t;
 
@@ -1,6 +1,8 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstdint>
+ #include <cstdio>
  #endif
 
  #include "llama_util.h"
@@ -9,6 +11,7 @@
  #include "ggml.h"
 
  #include <array>
+ #include <ctime>
  #include <cinttypes>
  #include <fstream>
  #include <random>
@@ -21,6 +24,9 @@
  #include <memory>
  #include <algorithm>
  #include <initializer_list>
+ #include <thread>
+ #include <atomic>
+ #include <mutex>
 
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -41,35 +47,51 @@ static const size_t MB = 1024*1024;
  // TODO: dynamically determine these sizes
  // needs modifications in ggml
 
- static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-     { MODEL_7B, 512ull*MB },
-     { MODEL_13B, 512ull*MB },
-     { MODEL_30B, 512ull*MB },
-     { MODEL_65B, 512ull*MB },
- };
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ {
+     static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+         { MODEL_7B, 512ull * MB },
+         { MODEL_13B, 512ull * MB },
+         { MODEL_30B, 512ull * MB },
+         { MODEL_65B, 512ull * MB },
+     };
+     return _MEM_REQ_SCRATCH0;
+ }
 
- static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-     { MODEL_7B, 512ull*MB },
-     { MODEL_13B, 512ull*MB },
-     { MODEL_30B, 512ull*MB },
-     { MODEL_65B, 512ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+ {
+     static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+         { MODEL_7B, 512ull * MB },
+         { MODEL_13B, 512ull * MB },
+         { MODEL_30B, 512ull * MB },
+         { MODEL_65B, 512ull * MB },
+     };
+     return _MEM_REQ_SCRATCH1;
  };
 
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
- static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-     { MODEL_7B, 1026ull*MB },
-     { MODEL_13B, 1608ull*MB },
-     { MODEL_30B, 3124ull*MB },
-     { MODEL_65B, 5120ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+ {
+     static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+         { MODEL_7B, 1026ull * MB },
+         { MODEL_13B, 1608ull * MB },
+         { MODEL_30B, 3124ull * MB },
+         { MODEL_65B, 5120ull * MB },
+     };
+     return _MEM_REQ_KV_SELF;
  };
 
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-     { MODEL_7B, 768ull*MB },
-     { MODEL_13B, 1024ull*MB },
-     { MODEL_30B, 1280ull*MB },
-     { MODEL_65B, 1536ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ {
+     static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+         { MODEL_7B, 768ull * MB },
+         { MODEL_13B, 1024ull * MB },
+         { MODEL_30B, 1280ull * MB },
+         { MODEL_65B, 1536ull * MB },
+     };
+     return _MEM_REQ_EVAL;
  };
 
  // default hparams (LLaMA 7B)
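The refactor above replaces namespace-scope map constants with accessor functions whose maps are function-local statics, so each map is built on first use rather than during static initialization (the construct-on-first-use idiom, which sidesteps static initialization order problems across translation units). A standalone illustration of the same pattern (names here are illustrative, not from llama.cpp):

    #include <cstdio>
    #include <map>
    #include <string>

    // The map is constructed the first time the accessor runs, not at
    // static-initialization time, so initialization order no longer matters.
    static const std::map<std::string, size_t> & default_scratch_mb() {
        static std::map<std::string, size_t> sizes = {
            { "7B",  512 },
            { "13B", 512 },
        };
        return sizes;
    }

    int main() {
        std::printf("7B scratch: %zu MB\n", default_scratch_mb().at("7B"));
        return 0;
    }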
@@ -261,12 +283,12 @@ static size_t checked_div(size_t a, size_t b) {
  }
 
  static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
-     std::string ret = "[" + std::to_string(ne.at(0));
+     char buf[256];
+     snprintf(buf, sizeof(buf), "%5u", ne.at(0));
      for (size_t i = 1; i < ne.size(); i++) {
-         ret += " x " + std::to_string(ne.at(i));
+         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
      }
-     ret += "]";
-     return ret;
+     return buf;
  }
 
  static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
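The rewritten helper now emits fixed-width "%5u" fields joined by " x " and drops the surrounding brackets, so a shape like {4096, 32000} formats as " 4096 x 32000". A small sketch (the function is file-local, so this only makes sense inside llama.cpp):

    std::vector<uint32_t> ne = { 4096, 32000 };
    printf("%s\n", llama_format_tensor_shape(ne).c_str());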
@@ -459,6 +481,8 @@ struct llama_file_loader {
          case GGML_TYPE_F16:
          case GGML_TYPE_Q4_0:
          case GGML_TYPE_Q4_1:
+         case GGML_TYPE_Q4_2:
+         case GGML_TYPE_Q4_3:
              break;
          default: {
              throw format("unrecognized tensor type %u\n", shard.type);
@@ -531,6 +555,8 @@ struct llama_file_saver {
          case GGML_TYPE_F16:
          case GGML_TYPE_Q4_0:
          case GGML_TYPE_Q4_1:
+         case GGML_TYPE_Q4_2:
+         case GGML_TYPE_Q4_3:
              break;
          default: LLAMA_ASSERT(false);
      }
@@ -616,6 +642,7 @@ struct llama_model_loader {
              throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                           name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
          }
+
          return get_tensor_for(lt);
      }
 
@@ -818,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
      case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
      case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                    return "mostly Q4_1, some F16";
+     case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+     case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
      default: return "unknown, may not work";
  }
  }
@@ -898,13 +927,13 @@ static void llama_model_load_internal(
      const size_t mem_required =
          ctx_size +
          mmapped_size +
-         MEM_REQ_SCRATCH0.at(model.type) +
-         MEM_REQ_SCRATCH1.at(model.type) +
-         MEM_REQ_EVAL.at (model.type);
+         MEM_REQ_SCRATCH0().at(model.type) +
+         MEM_REQ_SCRATCH1().at(model.type) +
+         MEM_REQ_EVAL().at(model.type);
 
      // this is the memory required by one llama_state
      const size_t mem_required_state =
-         scale*MEM_REQ_KV_SELF.at(model.type);
+         scale*MEM_REQ_KV_SELF().at(model.type);
 
      fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
              mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +970,8 @@
      ml->ggml_ctx = ctx;
 
      model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-     model.norm = ml->get_tensor("norm.weight", {n_embd});
-     model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+     model.norm   = ml->get_tensor("norm.weight",   {n_embd});
+     model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
 
      model.layers.resize(n_layer);
      for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1046,7 +1075,7 @@ static bool llama_eval_internal(
      // for big prompts, if BLAS is enabled, it is better to use only one thread
      // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
      ggml_cgraph gf = {};
-     gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
 
      struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
      memcpy(embd->data, tokens, N*ggml_element_size(embd));
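The intent of the changed line: stay single-threaded only when a CPU BLAS library will absorb the large matrix multiplications; with cuBLAS the GPU takes those, so keeping the CPU threads is worthwhile. A sketch of the same check factored into a helper (the helper name is illustrative, not part of llama.cpp):

    static int pick_n_threads(int n_tokens, int n_threads) {
        // large batch + CPU BLAS: one thread avoids spin-lock contention around BLAS calls
        return (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas()) ? 1 : n_threads;
    }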
@@ -1546,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
  // quantization
  //
 
- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
      ggml_type quantized_type;
      switch (ftype) {
          case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
          case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+         case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
          default: throw format("invalid output file type %d\n", ftype);
      };
 
+     if (nthread <= 0) {
+         nthread = std::thread::hardware_concurrency();
+     }
+
      std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                              /*vocab_only*/ false));
      llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1562,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      size_t total_size_new = 0;
      std::vector<int64_t> hist_all(1 << 4, 0);
 
+     std::vector<std::thread> workers;
+     std::mutex mutex;
+
      size_t idx = 0;
      for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
          llama_buffer read_data;
@@ -1569,7 +1607,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          tensor.data = read_data.addr;
          model_loader->load_data_for(tensor);
 
-         printf("[%zu/%zu] %36s - %s, type = %6s, ",
+         printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                 ++idx, model_loader->tensors_map.tensors.size(),
                 tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                 ggml_type_name(tensor.type));
@@ -1580,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          // quantize only 2D tensors
          quantize &= (tensor.ne.size() == 2);
 
+         // GG: uncomment this to keep the output layer in FP16
+         //if (tensor.name.rfind("output")) {
+         //    quantize = false;
+         //}
+
          enum ggml_type new_type;
          void * new_data;
          size_t new_size;
@@ -1615,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
              new_data = work.addr;
              std::vector<int64_t> hist_cur(1 << 4, 0);
 
-             switch (new_type) {
-                 case GGML_TYPE_Q4_0:
-                     {
-                         new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                     } break;
-                 case GGML_TYPE_Q4_1:
-                     {
-                         new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                     } break;
-                 default:
-                     LLAMA_ASSERT(false);
+             int chunk_size = 32 * 512;
+             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+             if (nthread_use < 2) {
+                 new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+             } else {
+                 size_t counter = 0;
+                 new_size = 0;
+                 auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                     std::vector<int64_t> local_hist;
+                     size_t local_size = 0;
+                     while (true) {
+                         std::unique_lock<std::mutex> lock(mutex);
+                         size_t first = counter; counter += chunk_size;
+                         if (first >= nelements) {
+                             if (!local_hist.empty()) {
+                                 for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                 new_size += local_size;
+                             }
+                             break;
+                         }
+                         lock.unlock();
+                         size_t last = std::min(nelements, first + chunk_size);
+                         if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                     }
+                 };
+                 if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                 for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                 compute();
+                 for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
              }
 
              printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1731,10 +1794,10 @@ struct llama_context * llama_init_from_file(
          ctx->embedding.resize(hparams.n_embd);
      }
 
-     ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-     ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+     ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }
 
  return ctx;
@@ -1747,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
  int llama_model_quantize(
          const char * fname_inp,
          const char * fname_out,
-         enum llama_ftype ftype) {
+         enum llama_ftype ftype,
+         int nthread) {
      try {
-         llama_model_quantize_internal(fname_inp, fname_out, ftype);
+         llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
          return 0;
      } catch (const std::string & err) {
          fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1757,6 +1821,254 @@ int llama_model_quantize(
  }
  }
 
+ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+     auto & model = ctx->model;
+
+     const int64_t t_start_lora_us = ggml_time_us();
+
+     auto fin = std::ifstream(path_lora, std::ios::binary);
+     if (!fin) {
+         fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+         return 1;
+     }
+
+     // verify magic and version
+     {
+         uint32_t magic;
+         fin.read((char *) &magic, sizeof(magic));
+         if (magic != 'ggla') {
+             fprintf(stderr, "%s: bad file magic\n", __func__);
+             return 1;
+         }
+         uint32_t format_version;
+         fin.read((char *) &format_version, sizeof(format_version));
+
+         if (format_version != 1) {
+             fprintf(stderr, "%s: unsupported file version\n", __func__ );
+             return 1;
+         }
+     }
+
+     int32_t lora_r;
+     int32_t lora_alpha;
+     fin.read((char *) &lora_r, sizeof(lora_r));
+     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+     float scaling = (float)lora_alpha / (float)lora_r;
+
+     fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+     // create a temporary ggml context to store the lora tensors
+     // todo: calculate size from biggest possible tensor
+     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+     struct ggml_init_params params;
+     params.mem_size = lora_buf.size();
+     params.mem_buffer = lora_buf.data();
+     params.no_alloc = false;
+
+     ggml_context * lora_ctx = ggml_init(params);
+     std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+     // create a name -> tensor map of the model to accelerate lookups
+     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+     for (auto & kv: model.tensors_by_name) {
+         model_tensors.insert(kv);
+     }
+
+
+     // load base model
+     std::unique_ptr<llama_model_loader> model_loader;
+     ggml_context * base_ctx = NULL;
+     llama_buffer base_buf;
+     if (path_base_model) {
+         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+         size_t ctx_size, mmapped_size;
+         model_loader->calc_sizes(&ctx_size, &mmapped_size);
+         base_buf.resize(ctx_size);
+
+         ggml_init_params base_params;
+         base_params.mem_size = base_buf.size;
+         base_params.mem_buffer = base_buf.addr;
+         base_params.no_alloc = model_loader->use_mmap;
+
+         base_ctx = ggml_init(base_params);
+
+         model_loader->ggml_ctx = base_ctx;
+
+         // maybe this should in llama_model_loader
+         if (model_loader->use_mmap) {
+             model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+         }
+     }
+
+     // read tensors and apply
+     bool warned = false;
+     int n_tensors = 0;
+     while (true) {
+         int32_t n_dims;
+         int32_t length;
+         int32_t ftype;
+
+         fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+         fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+         fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+         if (fin.eof()) {
+             break;
+         }
+
+         int32_t ne[2] = { 1, 1 };
+         for (int i = 0; i < n_dims; ++i) {
+             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+         }
+
+         std::string name(length, 0);
+         fin.read(&name[0], length);
+
+         // check for lora suffix and get the type of tensor
+         const std::string lora_suffix = ".lora";
+         size_t pos = name.rfind(lora_suffix);
+         if (pos == std::string::npos) {
+             fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+             return 1;
+         }
+
+         std::string lora_type = name.substr(pos + lora_suffix.length());
+         std::string base_name = name;
+         base_name.erase(pos);
+         // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+         if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+             fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+             return 1;
+         }
+
+         // create ggml tensor
+         ggml_type wtype;
+         switch (ftype) {
+             case 0: wtype = GGML_TYPE_F32; break;
+             case 1: wtype = GGML_TYPE_F16; break;
+             default:
+                 {
+                     fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+                             __func__, ftype);
+                     return false;
+                 }
+         }
+         ggml_tensor* lora_tensor;
+         if (n_dims == 2) {
+             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+         }
+         else {
+             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+             return 1;
+         }
+
+         // load tensor data
+         size_t offset = fin.tellg();
+         size_t tensor_data_size = ggml_nbytes(lora_tensor);
+         offset = (offset + 31) & -32;
+         fin.seekg(offset);
+         fin.read((char*)lora_tensor->data, tensor_data_size);
+
+         lora_tensors[name] = lora_tensor;
+
+         // check if we have both A and B tensors and apply
+         if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+             ggml_tensor * dest_t = model_tensors[base_name];
+             ggml_tensor * base_t;
+             if (model_loader) {
+                 // load from base model
+                 if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                     fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                     return 1;
+                 }
+                 size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                 llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                 base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                 lt.data = (uint8_t *) lt.ggml_tensor->data;
+                 model_loader->load_data_for(lt);
+                 lt.ggml_tensor->data = lt.data;
+             }
+             else {
+                 base_t = dest_t;
+             }
+
+             if (ggml_is_quantized(base_t->type)) {
+                 if (!warned) {
+                     fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                                     "use a f16 or f32 base model with --lora-base\n", __func__);
+                     warned = true;
+                 }
+             }
+
+             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                                 " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+                 return 1;
+             }
+
+             // w = w + BA*s
+             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+             if (scaling != 1.0f) {
+                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                 BA = ggml_scale(lora_ctx, BA, scale_tensor);
+             }
+
+             ggml_tensor * r;
+             if (base_t == dest_t) {
+                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+             }
+             else {
+                 r = ggml_add(lora_ctx, base_t, BA);
+                 r = ggml_cpy(lora_ctx, r, dest_t);
+             }
+
+             struct ggml_cgraph gf = ggml_build_forward(r);
+             gf.n_threads = n_threads;
+             ggml_graph_compute(lora_ctx, &gf);
+
+             // we won't need these tensors again, reset the context to save memory
+             ggml_free(lora_ctx);
+             lora_ctx = ggml_init(params);
+             lora_tensors.clear();
+
+             n_tensors++;
+             if (n_tensors % 4 == 0)
+                 fprintf(stderr, ".");
+         }
+     }
+
+     // TODO: this should be in a destructor, it will leak on failure
+     ggml_free(lora_ctx);
+     if (base_ctx) {
+         ggml_free(base_ctx);
+     }
+
+     const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+     fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+     return 0;
+ }
+
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+     try {
+         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+     } catch (const std::string & err) {
+         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+         return 1;
+     }
+ }
+
  // Returns the KV cache that will contain the context for the
  // ongoing prediction with the model.
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
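A hypothetical call into the new public entry point (the paths are placeholders). Passing a separate f16/f32 model via path_base_model lets the adapter be applied against unquantized weights, which the warning above recommends; pass NULL to patch the loaded model's tensors directly:

    const int rc = llama_apply_lora_from_file(
            ctx,
            "lora/ggml-adapter-model.bin",     // placeholder adapter path
            "models/7B/ggml-model-f16.bin",    // placeholder --lora-base model, may be NULL
            /*n_threads=*/8);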
@@ -1914,18 +2226,20 @@ const char * llama_print_system_info(void) {
      static std::string s;
 
      s = "";
-     s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-     s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-     s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-     s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-     s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-     s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-     s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+     s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
+     s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
+     s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
+     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+     s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
+     s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+     s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
+     s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
+     s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
+     s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
+     s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+     s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
+     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
 
      return s.c_str();
  }
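The expanded report (now including AVX512_VBMI and AVX512_VNNI) can be dumped at startup; a one-line usage sketch:

    fprintf(stderr, "system info: %s\n", llama_print_system_info());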