llama_cpp 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ggml.h

@@ -204,6 +204,9 @@ enum ggml_type {
  GGML_TYPE_F16 = 1,
  GGML_TYPE_Q4_0 = 2,
  GGML_TYPE_Q4_1 = 3,
+ GGML_TYPE_Q4_2 = 4,
+ GGML_TYPE_Q4_3 = 5,
+ GGML_TYPE_Q8_0 = 6,
  GGML_TYPE_I8,
  GGML_TYPE_I16,
  GGML_TYPE_I32,
@@ -358,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);

  size_t ggml_element_size(const struct ggml_tensor * tensor);

+ bool ggml_is_quantized(enum ggml_type type);
+
  struct ggml_context * ggml_init(struct ggml_init_params params);
  void ggml_free(struct ggml_context * ctx);

@@ -429,6 +434,12 @@ struct ggml_tensor * ggml_add(
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+
+ struct ggml_tensor * ggml_add_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  struct ggml_tensor * ggml_sub(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
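
The new ggml_add_inplace() mirrors ggml_add() but accumulates into its first operand and returns a view of it, avoiding a separate result buffer; the LoRA code added later in this diff uses it to patch weights in place. A rough usage sketch, assuming an existing ggml_context * ctx and two same-shaped tensors a and b (the variable names are illustrative, not taken from this diff):

    // a <- a + b; the result tensor is a view of a, so no new buffer is allocated
    struct ggml_tensor * sum = ggml_add_inplace(ctx, a, b);
    struct ggml_cgraph gf = ggml_build_forward(sum);
    gf.n_threads = 4;               // worker threads for this graph
    ggml_graph_compute(ctx, &gf);   // executes the in-place addition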
@@ -619,7 +630,8 @@ struct ggml_tensor * ggml_soft_max(

  // rotary position embedding
  // in-place, returns view(a)
- // if mode == 1, skip n_past elements
+ // if mode & 1 == 1, skip n_past elements
+ // if mode & 2 == 1, GPT-NeoX style
  // TODO: avoid creating a new tensor every time
  struct ggml_tensor * ggml_rope(
  struct ggml_context * ctx,
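
The rope mode argument is now treated as a bit field: bit 0 keeps the existing "skip n_past elements" behaviour and bit 1 selects GPT-NeoX style rotation. A small, hedged illustration of how a caller might compose the flags (the tensor and the n_past/n_rot values are placeholders, and the full ggml_rope() parameter list is assumed from upstream ggml rather than shown in this hunk):

    // bit 0: skip n_past elements, bit 1: GPT-NeoX style rotary embedding
    const int rope_mode = 1 | 2;   // both behaviours enabled
    cur = ggml_rope(ctx0, cur, n_past, n_rot, rope_mode);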
@@ -799,6 +811,10 @@ enum ggml_opt_result ggml_opt(

  size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
  size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

  //
  // system info
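
ggml_quantize_chunk() is a generic entry point that dispatches on ggml_type and quantizes the [start, start + n) sub-range of a float buffer, returning the number of bytes written; the q4_2/q4_3 helpers keep the older per-type interface. A hedged sketch of quantizing a whole buffer in one call (buffer names and sizes are made up for illustration; n should be a multiple of the type's block size):

    std::vector<float>   src(4096);          // source weights (placeholder data)
    std::vector<uint8_t> dst(src.size());    // output; 4-bit formats need well under 1 byte per element
    std::vector<int64_t> hist(1 << 4, 0);    // 16-bucket histogram, as used by llama.cpp

    size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q4_2, src.data(), dst.data(),
                                       /*start=*/0, (int) src.size(), hist.data());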
@@ -807,6 +823,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
  int ggml_cpu_has_avx(void);
  int ggml_cpu_has_avx2(void);
  int ggml_cpu_has_avx512(void);
+ int ggml_cpu_has_avx512_vbmi(void);
+ int ggml_cpu_has_avx512_vnni(void);
  int ggml_cpu_has_fma(void);
  int ggml_cpu_has_neon(void);
  int ggml_cpu_has_arm_fma(void);
@@ -814,6 +832,7 @@ int ggml_cpu_has_f16c(void);
  int ggml_cpu_has_fp16_va(void);
  int ggml_cpu_has_wasm_simd(void);
  int ggml_cpu_has_blas(void);
+ int ggml_cpu_has_cublas(void);
  int ggml_cpu_has_sse3(void);
  int ggml_cpu_has_vsx(void);

@@ -836,6 +855,7 @@ typedef struct {
  dequantize_row_q_t dequantize_row_q;
  quantize_row_q_t quantize_row_q;
  quantize_row_q_t quantize_row_q_reference;
+ quantize_row_q_t quantize_row_q_dot;
  vec_dot_q_t vec_dot_q;
  } quantize_fns_t;

llama.cpp

@@ -1,6 +1,8 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstdint>
+ #include <cstdio>
  #endif

  #include "llama_util.h"
@@ -9,6 +11,7 @@
  #include "ggml.h"

  #include <array>
+ #include <ctime>
  #include <cinttypes>
  #include <fstream>
  #include <random>
@@ -21,6 +24,9 @@
  #include <memory>
  #include <algorithm>
  #include <initializer_list>
+ #include <thread>
+ #include <atomic>
+ #include <mutex>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -41,35 +47,51 @@ static const size_t MB = 1024*1024;
  // TODO: dynamically determine these sizes
  // needs modifications in ggml

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
- };
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH0;
+ }

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH1;
  };

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
- static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
- { MODEL_7B, 1026ull*MB },
- { MODEL_13B, 1608ull*MB },
- { MODEL_30B, 3124ull*MB },
- { MODEL_65B, 5120ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ { MODEL_7B, 1026ull * MB },
+ { MODEL_13B, 1608ull * MB },
+ { MODEL_30B, 3124ull * MB },
+ { MODEL_65B, 5120ull * MB },
+ };
+ return _MEM_REQ_KV_SELF;
  };

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> MEM_REQ_EVAL = {
- { MODEL_7B, 768ull*MB },
- { MODEL_13B, 1024ull*MB },
- { MODEL_30B, 1280ull*MB },
- { MODEL_65B, 1536ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ { MODEL_7B, 768ull * MB },
+ { MODEL_13B, 1024ull * MB },
+ { MODEL_30B, 1280ull * MB },
+ { MODEL_65B, 1536ull * MB },
+ };
+ return _MEM_REQ_EVAL;
  };

  // default hparams (LLaMA 7B)
@@ -261,12 +283,12 @@ static size_t checked_div(size_t a, size_t b) {
  }

  static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
- std::string ret = "[" + std::to_string(ne.at(0));
+ char buf[256];
+ snprintf(buf, sizeof(buf), "%5u", ne.at(0));
  for (size_t i = 1; i < ne.size(); i++) {
- ret += " x " + std::to_string(ne.at(i));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
  }
- ret += "]";
- return ret;
+ return buf;
  }

  static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -459,6 +481,8 @@ struct llama_file_loader {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
  break;
  default: {
  throw format("unrecognized tensor type %u\n", shard.type);
@@ -531,6 +555,8 @@ struct llama_file_saver {
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q4_2:
+ case GGML_TYPE_Q4_3:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -616,6 +642,7 @@ struct llama_model_loader {
  throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
  }
+
  return get_tensor_for(lt);
  }

@@ -818,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
+ case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+ case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
  default: return "unknown, may not work";
  }
  }
@@ -898,13 +927,13 @@ static void llama_model_load_internal(
  const size_t mem_required =
  ctx_size +
  mmapped_size +
- MEM_REQ_SCRATCH0.at(model.type) +
- MEM_REQ_SCRATCH1.at(model.type) +
- MEM_REQ_EVAL.at (model.type);
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF.at(model.type);
+ scale*MEM_REQ_KV_SELF().at(model.type);

  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +970,8 @@ static void llama_model_load_internal(
  ml->ggml_ctx = ctx;

  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1046,7 +1075,7 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1546,14 +1575,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
  // quantization
  //

- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
  ggml_type quantized_type;
  switch (ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
  default: throw format("invalid output file type %d\n", ftype);
  };

+ if (nthread <= 0) {
+ nthread = std::thread::hardware_concurrency();
+ }
+
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1562,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  size_t total_size_new = 0;
  std::vector<int64_t> hist_all(1 << 4, 0);

+ std::vector<std::thread> workers;
+ std::mutex mutex;
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
@@ -1569,7 +1607,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  tensor.data = read_data.addr;
  model_loader->load_data_for(tensor);

- printf("[%zu/%zu] %36s - %s, type = %6s, ",
+ printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
  ++idx, model_loader->tensors_map.tensors.size(),
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
  ggml_type_name(tensor.type));
@@ -1580,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);

+ // GG: uncomment this to keep the output layer in FP16
+ //if (tensor.name.rfind("output")) {
+ // quantize = false;
+ //}
+
  enum ggml_type new_type;
  void * new_data;
  size_t new_size;
@@ -1615,17 +1658,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_data = work.addr;
  std::vector<int64_t> hist_cur(1 << 4, 0);

- switch (new_type) {
- case GGML_TYPE_Q4_0:
- {
- new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
- } break;
- default:
- LLAMA_ASSERT(false);
+ int chunk_size = 32 * 512;
+ const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+ const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+ if (nthread_use < 2) {
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+ } else {
+ size_t counter = 0;
+ new_size = 0;
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+ std::vector<int64_t> local_hist;
+ size_t local_size = 0;
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex);
+ size_t first = counter; counter += chunk_size;
+ if (first >= nelements) {
+ if (!local_hist.empty()) {
+ for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+ new_size += local_size;
+ }
+ break;
+ }
+ lock.unlock();
+ size_t last = std::min(nelements, first + chunk_size);
+ if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+ }
+ };
+ if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+ compute();
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
  }

  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1731,10 +1794,10 @@ struct llama_context * llama_init_from_file(
  ctx->embedding.resize(hparams.n_embd);
  }

- ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
- ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

  return ctx;
@@ -1747,9 +1810,10 @@ void llama_free(struct llama_context * ctx) {
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype) {
+ enum llama_ftype ftype,
+ int nthread) {
  try {
- llama_model_quantize_internal(fname_inp, fname_out, ftype);
+ llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
  return 0;
  } catch (const std::string & err) {
  fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1757,6 +1821,254 @@ int llama_model_quantize(
  }
  }

+ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+ auto & model = ctx->model;
+
+ const int64_t t_start_lora_us = ggml_time_us();
+
+ auto fin = std::ifstream(path_lora, std::ios::binary);
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+ return 1;
+ }
+
+ // verify magic and version
+ {
+ uint32_t magic;
+ fin.read((char *) &magic, sizeof(magic));
+ if (magic != 'ggla') {
+ fprintf(stderr, "%s: bad file magic\n", __func__);
+ return 1;
+ }
+ uint32_t format_version;
+ fin.read((char *) &format_version, sizeof(format_version));
+
+ if (format_version != 1) {
+ fprintf(stderr, "%s: unsupported file version\n", __func__ );
+ return 1;
+ }
+ }
+
+ int32_t lora_r;
+ int32_t lora_alpha;
+ fin.read((char *) &lora_r, sizeof(lora_r));
+ fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+ float scaling = (float)lora_alpha / (float)lora_r;
+
+ fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+ // create a temporary ggml context to store the lora tensors
+ // todo: calculate size from biggest possible tensor
+ std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+ struct ggml_init_params params;
+ params.mem_size = lora_buf.size();
+ params.mem_buffer = lora_buf.data();
+ params.no_alloc = false;
+
+ ggml_context * lora_ctx = ggml_init(params);
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+ // create a name -> tensor map of the model to accelerate lookups
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+ for (auto & kv: model.tensors_by_name) {
+ model_tensors.insert(kv);
+ }
+
+
+ // load base model
+ std::unique_ptr<llama_model_loader> model_loader;
+ ggml_context * base_ctx = NULL;
+ llama_buffer base_buf;
+ if (path_base_model) {
+ fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+ size_t ctx_size, mmapped_size;
+ model_loader->calc_sizes(&ctx_size, &mmapped_size);
+ base_buf.resize(ctx_size);
+
+ ggml_init_params base_params;
+ base_params.mem_size = base_buf.size;
+ base_params.mem_buffer = base_buf.addr;
+ base_params.no_alloc = model_loader->use_mmap;
+
+ base_ctx = ggml_init(base_params);
+
+ model_loader->ggml_ctx = base_ctx;
+
+ // maybe this should in llama_model_loader
+ if (model_loader->use_mmap) {
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ }
+ }
+
+ // read tensors and apply
+ bool warned = false;
+ int n_tensors = 0;
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ftype;
+
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ if (fin.eof()) {
+ break;
+ }
+
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ }
+
+ std::string name(length, 0);
+ fin.read(&name[0], length);
+
+ // check for lora suffix and get the type of tensor
+ const std::string lora_suffix = ".lora";
+ size_t pos = name.rfind(lora_suffix);
+ if (pos == std::string::npos) {
+ fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+ return 1;
+ }
+
+ std::string lora_type = name.substr(pos + lora_suffix.length());
+ std::string base_name = name;
+ base_name.erase(pos);
+ // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+ if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+ fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+ return 1;
+ }
+
+ // create ggml tensor
+ ggml_type wtype;
+ switch (ftype) {
+ case 0: wtype = GGML_TYPE_F32; break;
+ case 1: wtype = GGML_TYPE_F16; break;
+ default:
+ {
+ fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+ __func__, ftype);
+ return false;
+ }
+ }
+ ggml_tensor* lora_tensor;
+ if (n_dims == 2) {
+ lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+ }
+ else {
+ fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+ return 1;
+ }
+
+ // load tensor data
+ size_t offset = fin.tellg();
+ size_t tensor_data_size = ggml_nbytes(lora_tensor);
+ offset = (offset + 31) & -32;
+ fin.seekg(offset);
+ fin.read((char*)lora_tensor->data, tensor_data_size);
+
+ lora_tensors[name] = lora_tensor;
+
+ // check if we have both A and B tensors and apply
+ if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+ lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+ ggml_tensor * dest_t = model_tensors[base_name];
+ ggml_tensor * base_t;
+ if (model_loader) {
+ // load from base model
+ if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+ fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+ return 1;
+ }
+ size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+ llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
+ model_loader->load_data_for(lt);
+ lt.ggml_tensor->data = lt.data;
+ }
+ else {
+ base_t = dest_t;
+ }
+
+ if (ggml_is_quantized(base_t->type)) {
+ if (!warned) {
+ fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+ "use a f16 or f32 base model with --lora-base\n", __func__);
+ warned = true;
+ }
+ }
+
+ ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+ ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+ fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+ return 1;
+ }
+
+ // w = w + BA*s
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+ if (scaling != 1.0f) {
+ ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+ BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ }
+
+ ggml_tensor * r;
+ if (base_t == dest_t) {
+ r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ }
+ else {
+ r = ggml_add(lora_ctx, base_t, BA);
+ r = ggml_cpy(lora_ctx, r, dest_t);
+ }
+
+ struct ggml_cgraph gf = ggml_build_forward(r);
+ gf.n_threads = n_threads;
+ ggml_graph_compute(lora_ctx, &gf);
+
+ // we won't need these tensors again, reset the context to save memory
+ ggml_free(lora_ctx);
+ lora_ctx = ggml_init(params);
+ lora_tensors.clear();
+
+ n_tensors++;
+ if (n_tensors % 4 == 0)
+ fprintf(stderr, ".");
+ }
+ }
+
+ // TODO: this should be in a destructor, it will leak on failure
+ ggml_free(lora_ctx);
+ if (base_ctx) {
+ ggml_free(base_ctx);
+ }
+
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+ fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+ return 0;
+ }
+
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ } catch (const std::string & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+ return 1;
+ }
+ }
+
  // Returns the KV cache that will contain the context for the
  // ongoing prediction with the model.
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
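
The public wrapper for the block above is llama_apply_lora_from_file(); it streams the adapter's loraA/loraB pairs and patches the loaded weights as w = w + scaling*BA. A hedged usage sketch, to be called after llama_init_from_file() (paths and thread count are placeholders; passing an f16/f32 base model avoids patching quantized weights directly):

    int err = llama_apply_lora_from_file(ctx,
                                         "lora/ggml-adapter-model.bin",    // LoRA adapter
                                         "models/7B/ggml-model-f16.bin",   // optional base model, may be NULL
                                         /*n_threads=*/4);
    if (err != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
    }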
@@ -1914,18 +2226,20 @@ const char * llama_print_system_info(void) {
  static std::string s;

  s = "";
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+ s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+ s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

  return s.c_str();
  }
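
The system-info string now also reports the AVX512_VBMI and AVX512_VNNI probes added above. A trivial way to surface it at start-up, e.g. right after creating the context:

    fprintf(stderr, "system info: %s\n", llama_print_system_info());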