llama_cpp 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -204,6 +204,7 @@ enum ggml_type {
  GGML_TYPE_F16 = 1,
  GGML_TYPE_Q4_0 = 2,
  GGML_TYPE_Q4_1 = 3,
+ GGML_TYPE_Q8_0 = 4,
  GGML_TYPE_I8,
  GGML_TYPE_I16,
  GGML_TYPE_I32,
@@ -429,6 +430,12 @@ struct ggml_tensor * ggml_add(
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+
+ struct ggml_tensor * ggml_add_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  struct ggml_tensor * ggml_sub(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -807,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
  int ggml_cpu_has_avx(void);
  int ggml_cpu_has_avx2(void);
  int ggml_cpu_has_avx512(void);
+ int ggml_cpu_has_avx512_vbmi(void);
+ int ggml_cpu_has_avx512_vnni(void);
  int ggml_cpu_has_fma(void);
  int ggml_cpu_has_neon(void);
  int ggml_cpu_has_arm_fma(void);
@@ -836,6 +845,7 @@ typedef struct {
  dequantize_row_q_t dequantize_row_q;
  quantize_row_q_t quantize_row_q;
  quantize_row_q_t quantize_row_q_reference;
+ quantize_row_q_t quantize_row_q_dot;
  vec_dot_q_t vec_dot_q;
  } quantize_fns_t;

@@ -1,6 +1,8 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+ #include <cstdint>
+ #include <cstdio>
  #endif

  #include "llama_util.h"
@@ -9,6 +11,7 @@
  #include "ggml.h"

  #include <array>
+ #include <ctime>
  #include <cinttypes>
  #include <fstream>
  #include <random>
@@ -41,35 +44,51 @@ static const size_t MB = 1024*1024;
  // TODO: dynamically determine these sizes
  // needs modifications in ggml

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
- };
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH0;
+ }

- static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
- { MODEL_7B, 512ull*MB },
- { MODEL_13B, 512ull*MB },
- { MODEL_30B, 512ull*MB },
- { MODEL_65B, 512ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+ { MODEL_7B, 512ull * MB },
+ { MODEL_13B, 512ull * MB },
+ { MODEL_30B, 512ull * MB },
+ { MODEL_65B, 512ull * MB },
+ };
+ return _MEM_REQ_SCRATCH1;
  };

  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
- static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
- { MODEL_7B, 1026ull*MB },
- { MODEL_13B, 1608ull*MB },
- { MODEL_30B, 3124ull*MB },
- { MODEL_65B, 5120ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+ { MODEL_7B, 1026ull * MB },
+ { MODEL_13B, 1608ull * MB },
+ { MODEL_30B, 3124ull * MB },
+ { MODEL_65B, 5120ull * MB },
+ };
+ return _MEM_REQ_KV_SELF;
  };

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> MEM_REQ_EVAL = {
- { MODEL_7B, 768ull*MB },
- { MODEL_13B, 1024ull*MB },
- { MODEL_30B, 1280ull*MB },
- { MODEL_65B, 1536ull*MB },
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ {
+ static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+ { MODEL_7B, 768ull * MB },
+ { MODEL_13B, 1024ull * MB },
+ { MODEL_30B, 1280ull * MB },
+ { MODEL_65B, 1536ull * MB },
+ };
+ return _MEM_REQ_EVAL;
  };

  // default hparams (LLaMA 7B)
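
The scratch, KV-cache and eval memory tables above are now returned from accessor functions that build a function-local static map on first use, rather than being namespace-scope globals; this sidesteps the unspecified initialization order of globals across translation units. A minimal sketch of the same construct-on-first-use pattern, with hypothetical names that are not part of this package:

    #include <cstddef>
    #include <map>

    // Hypothetical example types; illustrates the idiom only.
    enum example_model { MODEL_SMALL, MODEL_LARGE };

    static const std::map<example_model, size_t> & example_mem_req() {
        // Constructed on the first call, so callers can never observe it
        // before initialization, unlike a namespace-scope global whose
        // initialization order across translation units is unspecified.
        static std::map<example_model, size_t> table = {
            { MODEL_SMALL,  512ull * 1024 * 1024 },
            { MODEL_LARGE, 1024ull * 1024 * 1024 },
        };
        return table;
    }

    size_t example_lookup(example_model m) {
        return example_mem_req().at(m);
    }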
@@ -261,12 +280,12 @@ static size_t checked_div(size_t a, size_t b) {
  }

  static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
- std::string ret = "[" + std::to_string(ne.at(0));
+ char buf[256];
+ snprintf(buf, sizeof(buf), "%5u", ne.at(0));
  for (size_t i = 1; i < ne.size(); i++) {
- ret += " x " + std::to_string(ne.at(i));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
  }
- ret += "]";
- return ret;
+ return buf;
  }

  static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -616,6 +635,7 @@ struct llama_model_loader {
  throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
  }
+
  return get_tensor_for(lt);
  }

@@ -898,13 +918,13 @@ static void llama_model_load_internal(
  const size_t mem_required =
  ctx_size +
  mmapped_size +
- MEM_REQ_SCRATCH0.at(model.type) +
- MEM_REQ_SCRATCH1.at(model.type) +
- MEM_REQ_EVAL.at (model.type);
+ MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH1().at(model.type) +
+ MEM_REQ_EVAL().at(model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF.at(model.type);
+ scale*MEM_REQ_KV_SELF().at(model.type);

  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +961,8 @@ static void llama_model_load_internal(
  ml->ggml_ctx = ctx;

  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
- model.norm = ml->get_tensor("norm.weight", {n_embd});
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});

  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1569,7 +1589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  tensor.data = read_data.addr;
  model_loader->load_data_for(tensor);

- printf("[%zu/%zu] %36s - %s, type = %6s, ",
+ printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
  ++idx, model_loader->tensors_map.tensors.size(),
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
  ggml_type_name(tensor.type));
@@ -1731,10 +1751,10 @@ struct llama_context * llama_init_from_file(
  ctx->embedding.resize(hparams.n_embd);
  }

- ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
- ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

  return ctx;
@@ -1757,6 +1777,254 @@ int llama_model_quantize(
  }
  }

+ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+ auto & model = ctx->model;
+
+ const int64_t t_start_lora_us = ggml_time_us();
+
+ auto fin = std::ifstream(path_lora, std::ios::binary);
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+ return 1;
+ }
+
+ // verify magic and version
+ {
+ uint32_t magic;
+ fin.read((char *) &magic, sizeof(magic));
+ if (magic != 'ggla') {
+ fprintf(stderr, "%s: bad file magic\n", __func__);
+ return 1;
+ }
+ uint32_t format_version;
+ fin.read((char *) &format_version, sizeof(format_version));
+
+ if (format_version != 1) {
+ fprintf(stderr, "%s: unsupported file version\n", __func__ );
+ return 1;
+ }
+ }
+
+ int32_t lora_r;
+ int32_t lora_alpha;
+ fin.read((char *) &lora_r, sizeof(lora_r));
+ fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+ float scaling = (float)lora_alpha / (float)lora_r;
+
+ fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+ // create a temporary ggml context to store the lora tensors
+ // todo: calculate size from biggest possible tensor
+ std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+ struct ggml_init_params params;
+ params.mem_size = lora_buf.size();
+ params.mem_buffer = lora_buf.data();
+ params.no_alloc = false;
+
+ ggml_context * lora_ctx = ggml_init(params);
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+ // create a name -> tensor map of the model to accelerate lookups
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+ for (auto & kv: model.tensors_by_name) {
+ model_tensors.insert(kv);
+ }
+
+
+ // load base model
+ std::unique_ptr<llama_model_loader> model_loader;
+ ggml_context * base_ctx = NULL;
+ llama_buffer base_buf;
+ if (path_base_model) {
+ fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+ size_t ctx_size, mmapped_size;
+ model_loader->calc_sizes(&ctx_size, &mmapped_size);
+ base_buf.resize(ctx_size);
+
+ ggml_init_params base_params;
+ base_params.mem_size = base_buf.size;
+ base_params.mem_buffer = base_buf.addr;
+ base_params.no_alloc = model_loader->use_mmap;
+
+ base_ctx = ggml_init(base_params);
+
+ model_loader->ggml_ctx = base_ctx;
+
+ // maybe this should in llama_model_loader
+ if (model_loader->use_mmap) {
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+ }
+ }
+
+ // read tensors and apply
+ bool warned = false;
+ int n_tensors = 0;
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ftype;
+
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ if (fin.eof()) {
+ break;
+ }
+
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ }
+
+ std::string name(length, 0);
+ fin.read(&name[0], length);
+
+ // check for lora suffix and get the type of tensor
+ const std::string lora_suffix = ".lora";
+ size_t pos = name.rfind(lora_suffix);
+ if (pos == std::string::npos) {
+ fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+ return 1;
+ }
+
+ std::string lora_type = name.substr(pos + lora_suffix.length());
+ std::string base_name = name;
+ base_name.erase(pos);
+ // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+ if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+ fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+ return 1;
+ }
+
+ // create ggml tensor
+ ggml_type wtype;
+ switch (ftype) {
+ case 0: wtype = GGML_TYPE_F32; break;
+ case 1: wtype = GGML_TYPE_F16; break;
+ default:
+ {
+ fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+ __func__, ftype);
+ return false;
+ }
+ }
+ ggml_tensor* lora_tensor;
+ if (n_dims == 2) {
+ lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+ }
+ else {
+ fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+ return 1;
+ }
+
+ // load tensor data
+ size_t offset = fin.tellg();
+ size_t tensor_data_size = ggml_nbytes(lora_tensor);
+ offset = (offset + 31) & -32;
+ fin.seekg(offset);
+ fin.read((char*)lora_tensor->data, tensor_data_size);
+
+ lora_tensors[name] = lora_tensor;
+
+ // check if we have both A and B tensors and apply
+ if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+ lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+ ggml_tensor * dest_t = model_tensors[base_name];
+ ggml_tensor * base_t;
+ if (model_loader) {
+ // load from base model
+ if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+ fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+ return 1;
+ }
+ size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+ llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
+ model_loader->load_data_for(lt);
+ lt.ggml_tensor->data = lt.data;
+ }
+ else {
+ base_t = dest_t;
+ }
+
+ if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+ if (!warned) {
+ fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+ "use a f16 or f32 base model with --lora-base\n", __func__);
+ warned = true;
+ }
+ }
+
+ ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+ ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+ fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+ return 1;
+ }
+
+ // w = w + BA*s
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+ if (scaling != 1.0f) {
+ ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+ BA = ggml_scale(lora_ctx, BA, scale_tensor);
+ }
+
+ ggml_tensor * r;
+ if (base_t == dest_t) {
+ r = ggml_add_inplace(lora_ctx, dest_t, BA);
+ }
+ else {
+ r = ggml_add(lora_ctx, base_t, BA);
+ r = ggml_cpy(lora_ctx, r, dest_t);
+ }
+
+ struct ggml_cgraph gf = ggml_build_forward(r);
+ gf.n_threads = n_threads;
+ ggml_graph_compute(lora_ctx, &gf);
+
+ // we won't need these tensors again, reset the context to save memory
+ ggml_free(lora_ctx);
+ lora_ctx = ggml_init(params);
+ lora_tensors.clear();
+
+ n_tensors++;
+ if (n_tensors % 4 == 0)
+ fprintf(stderr, ".");
+ }
+ }
+
+ // TODO: this should be in a destructor, it will leak on failure
+ ggml_free(lora_ctx);
+ if (base_ctx) {
+ ggml_free(base_ctx);
+ }
+
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+ fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+ return 0;
+ }
+
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ } catch (const std::string & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+ return 1;
+ }
+ }
+
  // Returns the KV cache that will contain the context for the
  // ongoing prediction with the model.
  const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
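
The tensor graph built for each pair of adapter tensors implements the in-code comment `w = w + BA*s`, with the scale taken from the adapter header; restated as a formula, with no semantics beyond the code above:

    W' = W + s \, (B \cdot A), \qquad s = \frac{\text{lora\_alpha}}{\text{lora\_r}}

When no separate base model is supplied (path_base_model is NULL), ggml_add_inplace accumulates the update directly into the loaded weight; otherwise the sum of the higher-precision base weight and the update is written back over the destination tensor with ggml_cpy.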
@@ -1914,18 +2182,20 @@ const char * llama_print_system_info(void) {
  static std::string s;

  s = "";
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+ s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+ s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

  return s.c_str();
  }
@@ -96,6 +96,18 @@ extern "C" {
  const char * fname_out,
  enum llama_ftype ftype);

+ // Apply a LoRA adapter to a loaded model
+ // path_base_model is the path to a higher quality model to use as a base for
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+ // will be applied on top of the previous one
+ // Returns 0 on success
+ LLAMA_API int llama_apply_lora_from_file(
+ struct llama_context * ctx,
+ const char * path_lora,
+ const char * path_base_model,
+ int n_threads);
+
  // Returns the KV cache that will contain the context for the
  // ongoing prediction with the model.
  LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
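
A hedged usage sketch of the new entry point from C++, assuming the loader functions already exposed by llama.h (llama_context_default_params, llama_init_from_file, llama_free); the file paths and thread count below are placeholders, not values from this diff:

    #include "llama.h"
    #include <cstdio>

    int main() {
        // Load a model as usual.
        llama_context_params params = llama_context_default_params();
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // Passing NULL applies the adapter to the loaded (possibly quantized)
        // weights; pass the path of an f16/f32 model instead to use it as the
        // base for the layers the adapter modifies.
        if (llama_apply_lora_from_file(ctx, "adapters/lora-adapter.bin", NULL, 4) != 0) {
            fprintf(stderr, "failed to apply the LoRA adapter\n");
            llama_free(ctx);
            return 1;
        }

        // ... evaluate and sample as usual ...
        llama_free(ctx);
        return 0;
    }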
@@ -43,8 +43,12 @@
  } while (0)

  #ifdef __GNUC__
+ #ifdef __MINGW32__
+ __attribute__((format(gnu_printf, 1, 2)))
+ #else
  __attribute__((format(printf, 1, 2)))
  #endif
+ #endif
  static std::string format(const char * fmt, ...) {
  va_list ap, ap2;
  va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
  va_end(ap2);
  va_end(ap);
  return std::string(buf.data(), size);
- };
+ }

  struct llama_file {
  // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
  static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file) {
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
  size = file->size;
  int fd = fileno(file->fp);
  int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
  flags |= MAP_POPULATE;
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
- close(fd);
  if (addr == MAP_FAILED) {
  throw format("mmap failed: %s", strerror(errno));
  }

- // Advise the kernel to preload the mapped memory
- if (madvise(addr, file->size, MADV_WILLNEED)) {
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
- strerror(errno));
+ if (prefetch) {
+ // Advise the kernel to preload the mapped memory
+ if (madvise(addr, file->size, MADV_WILLNEED)) {
+ fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ strerror(errno));
+ }
  }
  }

@@ -190,7 +195,7 @@ struct llama_mmap {
  #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file) {
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
  size = file->size;

  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -212,13 +217,15 @@ struct llama_mmap {
  }

  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
- // Advise the kernel to preload the mapped memory
- WIN32_MEMORY_RANGE_ENTRY range;
- range.VirtualAddress = addr;
- range.NumberOfBytes = (SIZE_T)size;
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
- llama_format_win_err(GetLastError()).c_str());
+ if (prefetch) {
+ // Advise the kernel to preload the mapped memory
+ WIN32_MEMORY_RANGE_ENTRY range;
+ range.VirtualAddress = addr;
+ range.NumberOfBytes = (SIZE_T)size;
+ if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+ fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
  }
  #else
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.4'
+ VERSION = '0.0.5'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-c85e03d'
+ LLAMA_CPP_VERSION = 'master-315a95a'
  end
data/lib/llama_cpp.rb CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
  # @param n_threads [Integer]
  # @return [String]
  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- prompt.insert(0, ' ')
+ spaced_prompt = " #{prompt}"

- embd_input = context.tokenize(text: prompt, add_bos: true)
+ embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
  break if embd[-1] == LLaMACpp.token_eos
  end

- output.join.delete_prefix(prompt).strip
+ output.join.delete_prefix(spaced_prompt).strip
  end
  end
data/sig/llama_cpp.rbs CHANGED
@@ -9,6 +9,8 @@ module LLaMACpp
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
+ def self?.mmap_supported?: () -> bool
+ def self?.mlock_supported?: () -> bool

  class Context
  public
@@ -28,6 +30,7 @@ module LLaMACpp
  def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
  def token_to_str: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+ def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  end

  class ContextParams
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: 0.0.5
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-15 00:00:00.000000000 Z
+ date: 2023-04-20 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: