llama_cpp 0.0.4 → 0.0.5

data/ext/llama_cpp/src/ggml.h CHANGED
@@ -204,6 +204,7 @@ enum ggml_type {
  GGML_TYPE_F16 = 1,
  GGML_TYPE_Q4_0 = 2,
  GGML_TYPE_Q4_1 = 3,
+ GGML_TYPE_Q8_0 = 4,
  GGML_TYPE_I8,
  GGML_TYPE_I16,
  GGML_TYPE_I32,
@@ -429,6 +430,12 @@ struct ggml_tensor * ggml_add(
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+
+ struct ggml_tensor * ggml_add_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  struct ggml_tensor * ggml_sub(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -807,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
  int ggml_cpu_has_avx(void);
  int ggml_cpu_has_avx2(void);
  int ggml_cpu_has_avx512(void);
+ int ggml_cpu_has_avx512_vbmi(void);
+ int ggml_cpu_has_avx512_vnni(void);
  int ggml_cpu_has_fma(void);
  int ggml_cpu_has_neon(void);
  int ggml_cpu_has_arm_fma(void);
@@ -836,6 +845,7 @@ typedef struct {
  dequantize_row_q_t dequantize_row_q;
  quantize_row_q_t quantize_row_q;
  quantize_row_q_t quantize_row_q_reference;
+ quantize_row_q_t quantize_row_q_dot;
  vec_dot_q_t vec_dot_q;
  } quantize_fns_t;

data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -1,6 +1,8 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstdint>
+ #include <cstdio>
  #endif

  #include "llama_util.h"
@@ -9,6 +11,7 @@
  #include "ggml.h"

  #include <array>
+ #include <ctime>
  #include <cinttypes>
  #include <fstream>
  #include <random>
@@ -41,35 +44,51 @@ static const size_t MB = 1024*1024;
  // TODO: dynamically determine these sizes
  // needs modifications in ggml

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
- };
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH0;
+ }

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH1;
  };

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
- static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
- { MODEL_7B, 1026ull*MB },
- { MODEL_13B, 1608ull*MB },
- { MODEL_30B, 3124ull*MB },
- { MODEL_65B, 5120ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ { MODEL_7B, 1026ull * MB },
+ { MODEL_13B, 1608ull * MB },
+ { MODEL_30B, 3124ull * MB },
+ { MODEL_65B, 5120ull * MB },
+ };
+ return _MEM_REQ_KV_SELF;
  };

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> MEM_REQ_EVAL = {
- { MODEL_7B, 768ull*MB },
- { MODEL_13B, 1024ull*MB },
- { MODEL_30B, 1280ull*MB },
- { MODEL_65B, 1536ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ { MODEL_7B, 768ull * MB },
+ { MODEL_13B, 1024ull * MB },
+ { MODEL_30B, 1280ull * MB },
+ { MODEL_65B, 1536ull * MB },
+ };
+ return _MEM_REQ_EVAL;
  };

  // default hparams (LLaMA 7B)
@@ -261,12 +280,12 @@ static size_t checked_div(size_t a, size_t b) {
  }

  static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
- std::string ret = "[" + std::to_string(ne.at(0));
+ char buf[256];
+ snprintf(buf, sizeof(buf), "%5u", ne.at(0));
  for (size_t i = 1; i < ne.size(); i++) {
- ret += " x " + std::to_string(ne.at(i));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
  }
- ret += "]";
- return ret;
+ return buf;
  }

  static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -616,6 +635,7 @@ struct llama_model_loader {
  throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
  }
+
  return get_tensor_for(lt);
  }

@@ -898,13 +918,13 @@ static void llama_model_load_internal(
  const size_t mem_required =
  ctx_size +
  mmapped_size +
- MEM_REQ_SCRATCH0.at(model.type) +
- MEM_REQ_SCRATCH1.at(model.type) +
- MEM_REQ_EVAL.at (model.type);
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF.at(model.type);
+ scale*MEM_REQ_KV_SELF().at(model.type);

  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +961,8 @@ static void llama_model_load_internal(
  ml->ggml_ctx = ctx;

  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1569,7 +1589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  tensor.data = read_data.addr;
  model_loader->load_data_for(tensor);

- printf("[%zu/%zu] %36s - %s, type = %6s, ",
+ printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
  ++idx, model_loader->tensors_map.tensors.size(),
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
  ggml_type_name(tensor.type));
@@ -1731,10 +1751,10 @@ struct llama_context * llama_init_from_file(
  ctx->embedding.resize(hparams.n_embd);
  }

- ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
- ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

  return ctx;
@@ -1757,6 +1777,254 @@ int llama_model_quantize(
  }
  }

+ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+ auto & model = ctx->model;
+
+ const int64_t t_start_lora_us = ggml_time_us();
+
+ auto fin = std::ifstream(path_lora, std::ios::binary);
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+ return 1;
+ }
+
+ // verify magic and version
+ {
+ uint32_t magic;
+ fin.read((char *) &magic, sizeof(magic));
+ if (magic != 'ggla') {
+ fprintf(stderr, "%s: bad file magic\n", __func__);
+ return 1;
+ }
+ uint32_t format_version;
+ fin.read((char *) &format_version, sizeof(format_version));
+
+ if (format_version != 1) {
+ fprintf(stderr, "%s: unsupported file version\n", __func__ );
+ return 1;
+ }
+ }
+
+ int32_t lora_r;
+ int32_t lora_alpha;
+ fin.read((char *) &lora_r, sizeof(lora_r));
+ fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+ float scaling = (float)lora_alpha / (float)lora_r;
+
+ fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+ // create a temporary ggml context to store the lora tensors
+ // todo: calculate size from biggest possible tensor
+ std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+ struct ggml_init_params params;
+ params.mem_size = lora_buf.size();
+ params.mem_buffer = lora_buf.data();
+ params.no_alloc = false;
+
+ ggml_context * lora_ctx = ggml_init(params);
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+ // create a name -> tensor map of the model to accelerate lookups
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+ for (auto & kv: model.tensors_by_name) {
+ model_tensors.insert(kv);
+ }
+
+
+ // load base model
+ std::unique_ptr<llama_model_loader> model_loader;
+ ggml_context * base_ctx = NULL;
+ llama_buffer base_buf;
+ if (path_base_model) {
+ fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+ size_t ctx_size, mmapped_size;
+ model_loader->calc_sizes(&ctx_size, &mmapped_size);
+ base_buf.resize(ctx_size);
+
+ ggml_init_params base_params;
+ base_params.mem_size = base_buf.size;
+ base_params.mem_buffer = base_buf.addr;
+ base_params.no_alloc = model_loader->use_mmap;
+
+ base_ctx = ggml_init(base_params);
+
+ model_loader->ggml_ctx = base_ctx;
+
+ // maybe this should in llama_model_loader
+ if (model_loader->use_mmap) {
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ }
+ }
+
+ // read tensors and apply
+ bool warned = false;
+ int n_tensors = 0;
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ftype;
+
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ if (fin.eof()) {
+ break;
+ }
+
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ }
+
+ std::string name(length, 0);
+ fin.read(&name[0], length);
+
+ // check for lora suffix and get the type of tensor
+ const std::string lora_suffix = ".lora";
+ size_t pos = name.rfind(lora_suffix);
+ if (pos == std::string::npos) {
+ fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+ return 1;
+ }
+
+ std::string lora_type = name.substr(pos + lora_suffix.length());
+ std::string base_name = name;
+ base_name.erase(pos);
+ // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+ if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+ fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+ return 1;
+ }
+
+ // create ggml tensor
+ ggml_type wtype;
+ switch (ftype) {
+ case 0: wtype = GGML_TYPE_F32; break;
+ case 1: wtype = GGML_TYPE_F16; break;
+ default:
+ {
+ fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+ __func__, ftype);
+ return false;
+ }
+ }
+ ggml_tensor* lora_tensor;
+ if (n_dims == 2) {
+ lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+ }
+ else {
+ fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+ return 1;
+ }
+
+ // load tensor data
+ size_t offset = fin.tellg();
+ size_t tensor_data_size = ggml_nbytes(lora_tensor);
+ offset = (offset + 31) & -32;
+ fin.seekg(offset);
+ fin.read((char*)lora_tensor->data, tensor_data_size);
+
+ lora_tensors[name] = lora_tensor;
+
+ // check if we have both A and B tensors and apply
+ if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+ lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+ ggml_tensor * dest_t = model_tensors[base_name];
+ ggml_tensor * base_t;
+ if (model_loader) {
+ // load from base model
+ if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+ fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+ return 1;
+ }
+ size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+ llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
+ model_loader->load_data_for(lt);
+ lt.ggml_tensor->data = lt.data;
+ }
+ else {
+ base_t = dest_t;
+ }
+
+ if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+ if (!warned) {
+ fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+ "use a f16 or f32 base model with --lora-base\n", __func__);
+ warned = true;
+ }
+ }
+
+ ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+ ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+ fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+ return 1;
+ }
+
+ // w = w + BA*s
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+ if (scaling != 1.0f) {
+ ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+ BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ }
+
+ ggml_tensor * r;
+ if (base_t == dest_t) {
+ r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ }
+ else {
+ r = ggml_add(lora_ctx, base_t, BA);
+ r = ggml_cpy(lora_ctx, r, dest_t);
+ }
+
+ struct ggml_cgraph gf = ggml_build_forward(r);
+ gf.n_threads = n_threads;
+ ggml_graph_compute(lora_ctx, &gf);
+
+ // we won't need these tensors again, reset the context to save memory
+ ggml_free(lora_ctx);
+ lora_ctx = ggml_init(params);
+ lora_tensors.clear();
+
+ n_tensors++;
+ if (n_tensors % 4 == 0)
+ fprintf(stderr, ".");
+ }
+ }
+
+ // TODO: this should be in a destructor, it will leak on failure
+ ggml_free(lora_ctx);
+ if (base_ctx) {
+ ggml_free(base_ctx);
+ }
+
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+ fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+ return 0;
+ }
+
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ } catch (const std::string & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+ return 1;
+ }
+ }
+
  // Returns the KV cache that will contain the context for the
  // ongoing prediction with the model.
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
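The adapter application added above follows the usual LoRA update; a sketch of the math, with symbols mirroring the code: each patched weight matrix becomes

    W' = W + s * (B A),  with  s = lora_alpha / lora_r

where loraA and loraB are the low-rank factors read from the adapter file, the product is built with ggml_mul_mat, scaled with ggml_scale when s != 1, and written back into the model tensor with ggml_add_inplace (or ggml_add followed by ggml_cpy when a separate base model supplies the original weights).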
@@ -1914,18 +2182,20 @@ const char * llama_print_system_info(void) {
  static std::string s;

  s = "";
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+ s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+ s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

  return s.c_str();
  }
data/ext/llama_cpp/src/llama.h CHANGED
@@ -96,6 +96,18 @@ extern "C" {
  const char * fname_out,
  enum llama_ftype ftype);

+ // Apply a LoRA adapter to a loaded model
+ // path_base_model is the path to a higher quality model to use as a base for
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+ // will be applied on top of the previous one
+ // Returns 0 on success
+ LLAMA_API int llama_apply_lora_from_file(
+ struct llama_context * ctx,
+ const char * path_lora,
+ const char * path_base_model,
+ int n_threads);
+
  // Returns the KV cache that will contain the context for the
  // ongoing prediction with the model.
  LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
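The Ruby bindings in this release surface this entry point as Context#apply_lora_from_file (see the RBS additions further down). A minimal, hedged sketch of the Ruby-side call, assuming `context` is an already-initialized LLaMACpp::Context and both file paths are placeholders:

    # Patch the loaded weights in place with a LoRA adapter.
    context.apply_lora_from_file(lora_path: 'path/to/ggml-adapter-model.bin', n_threads: 4)

    # Or, against a higher-precision base model (maps to path_base_model above);
    # per the comment above, reload the model before applying a different adapter.
    # context.apply_lora_from_file(lora_path: 'path/to/ggml-adapter-model.bin',
    #                              base_model_path: 'path/to/ggml-model-f16.bin',
    #                              n_threads: 4)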
data/ext/llama_cpp/src/llama_util.h CHANGED
@@ -43,8 +43,12 @@
  } while (0)

  #ifdef __GNUC__
+ #ifdef __MINGW32__
+ __attribute__((format(gnu_printf, 1, 2)))
+ #else
  __attribute__((format(printf, 1, 2)))
  #endif
+ #endif
  static std::string format(const char * fmt, ...) {
  va_list ap, ap2;
  va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
  va_end(ap2);
  va_end(ap);
  return std::string(buf.data(), size);
- };
+ }

  struct llama_file {
  // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
  static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file) {
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
  size = file->size;
  int fd = fileno(file->fp);
  int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
  flags |= MAP_POPULATE;
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
- close(fd);
  if (addr == MAP_FAILED) {
  throw format("mmap failed: %s", strerror(errno));
  }

- // Advise the kernel to preload the mapped memory
- if (madvise(addr, file->size, MADV_WILLNEED)) {
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
- strerror(errno));
+ if (prefetch) {
+ // Advise the kernel to preload the mapped memory
+ if (madvise(addr, file->size, MADV_WILLNEED)) {
+ fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ strerror(errno));
+ }
  }
  }

@@ -190,7 +195,7 @@ struct llama_mmap {
  #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file) {
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
  size = file->size;

  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -212,13 +217,15 @@ struct llama_mmap {
  }

  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
- // Advise the kernel to preload the mapped memory
- WIN32_MEMORY_RANGE_ENTRY range;
- range.VirtualAddress = addr;
- range.NumberOfBytes = (SIZE_T)size;
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
- llama_format_win_err(GetLastError()).c_str());
+ if (prefetch) {
+ // Advise the kernel to preload the mapped memory
+ WIN32_MEMORY_RANGE_ENTRY range;
+ range.VirtualAddress = addr;
+ range.NumberOfBytes = (SIZE_T)size;
+ if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+ fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
  }
  #else
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.4'
+ VERSION = '0.0.5'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-c85e03d'
+ LLAMA_CPP_VERSION = 'master-315a95a'
  end
data/lib/llama_cpp.rb CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
  # @param n_threads [Integer]
  # @return [String]
  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- prompt.insert(0, ' ')
+ spaced_prompt = " #{prompt}"

- embd_input = context.tokenize(text: prompt, add_bos: true)
+ embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
  break if embd[-1] == LLaMACpp.token_eos
  end

- output.join.delete_prefix(prompt).strip
+ output.join.delete_prefix(spaced_prompt).strip
  end
  end
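For reference, an end-to-end sketch of driving this method. It is hedged: the model path is a placeholder, and the ContextParams/Context constructor keywords are assumed from the gem's documentation of this era rather than shown in this diff:

    require 'llama_cpp'

    params  = LLaMACpp::ContextParams.new
    context = LLaMACpp::Context.new(model_path: 'path/to/ggml-model-q4_0.bin', params: params)

    # The caller's prompt string is no longer mutated; the leading space is added internally.
    puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)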
data/sig/llama_cpp.rbs CHANGED
@@ -9,6 +9,8 @@ module LLaMACpp
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
+ def self?.mmap_supported?: () -> bool
+ def self?.mlock_supported?: () -> bool

  class Context
  public
@@ -28,6 +30,7 @@ module LLaMACpp
  def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
  def token_to_str: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+ def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  end

  class ContextParams
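The two new module-level predicates presumably report whether the bundled llama.cpp build supports mmap-based loading and mlock-ing of model weights on the current platform (mirroring llama_mmap_supported() / llama_mlock_supported() in the C API); a short usage sketch:

    require 'llama_cpp'

    puts "mmap supported:  #{LLaMACpp.mmap_supported?}"
    puts "mlock supported: #{LLaMACpp.mlock_supported?}"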
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: 0.0.5
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-15 00:00:00.000000000 Z
+ date: 2023-04-20 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: