llama_cpp 0.0.4 → 0.0.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/ext/llama_cpp/src/ggml.c +1108 -508
- data/ext/llama_cpp/src/ggml.h +10 -0
- data/ext/llama_cpp/src/llama.cpp +317 -47
- data/ext/llama_cpp/src/llama.h +12 -0
- data/ext/llama_cpp/src/llama_util.h +22 -15
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED

@@ -204,6 +204,7 @@ enum ggml_type {
     GGML_TYPE_F16 = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q8_0 = 4,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,

@@ -429,6 +430,12 @@ struct ggml_tensor * ggml_add(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
+
+    struct ggml_tensor * ggml_add_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,

@@ -807,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
     int ggml_cpu_has_avx(void);
     int ggml_cpu_has_avx2(void);
     int ggml_cpu_has_avx512(void);
+    int ggml_cpu_has_avx512_vbmi(void);
+    int ggml_cpu_has_avx512_vnni(void);
     int ggml_cpu_has_fma(void);
     int ggml_cpu_has_neon(void);
     int ggml_cpu_has_arm_fma(void);

@@ -836,6 +845,7 @@ typedef struct {
     dequantize_row_q_t dequantize_row_q;
     quantize_row_q_t   quantize_row_q;
     quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
     vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;

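The new ggml_add_inplace declared above is what the LoRA merge code in llama.cpp (further down in this diff) uses to accumulate an update directly into an existing weight tensor instead of allocating a separate result. A minimal sketch of the call pattern, assuming only the ggml API shown in this header; the buffer size, tensor shapes, and thread count are arbitrary illustration values, and the tensor data is left uninitialized because the point is the graph construction:

#include "ggml.h"
#include <cstdint>
#include <vector>

int main() {
    // back the context with a caller-provided buffer, as llama.cpp does
    std::vector<uint8_t> buf(8 * 1024 * 1024);
    struct ggml_init_params params;
    params.mem_size   = buf.size();
    params.mem_buffer = buf.data();
    params.no_alloc   = false;

    struct ggml_context * ctx = ggml_init(params);

    // w is updated in place; delta stands in for the scaled LoRA product
    struct ggml_tensor * w     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * delta = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);

    // unlike ggml_add, no separate result tensor is allocated for the sum
    struct ggml_tensor * r = ggml_add_inplace(ctx, w, delta);

    struct ggml_cgraph gf = ggml_build_forward(r);
    gf.n_threads = 4;
    ggml_graph_compute(ctx, &gf);

    ggml_free(ctx);
    return 0;
}
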
data/ext/llama_cpp/src/llama.cpp
CHANGED

@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif
 
 #include "llama_util.h"

@@ -9,6 +11,7 @@
 #include "ggml.h"
 
 #include <array>
+#include <ctime>
 #include <cinttypes>
 #include <fstream>
 #include <random>
@@ -41,35 +44,51 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_7B,   512ull * MB },
-    { MODEL_13B,  512ull * MB },
-    { MODEL_30B,  512ull * MB },
-    { MODEL_65B,  512ull * MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B,   512ull * MB },
+        { MODEL_13B,  512ull * MB },
+        { MODEL_30B,  512ull * MB },
+        { MODEL_65B,  512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_7B,   512ull * MB },
-    { MODEL_13B,  512ull * MB },
-    { MODEL_30B,  512ull * MB },
-    { MODEL_65B,  512ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B,   512ull * MB },
+        { MODEL_13B,  512ull * MB },
+        { MODEL_30B,  512ull * MB },
+        { MODEL_65B,  512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
 };
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_7B,   1026ull * MB },
-    { MODEL_13B,  1608ull * MB },
-    { MODEL_30B,  3124ull * MB },
-    { MODEL_65B,  5120ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B,   1026ull * MB },
+        { MODEL_13B,  1608ull * MB },
+        { MODEL_30B,  3124ull * MB },
+        { MODEL_65B,  5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
 };
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,    768ull * MB },
-    { MODEL_13B,  1024ull * MB },
-    { MODEL_30B,  1280ull * MB },
-    { MODEL_65B,  1536ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B,    768ull * MB },
+        { MODEL_13B,  1024ull * MB },
+        { MODEL_30B,  1280ull * MB },
+        { MODEL_65B,  1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
 };
 
 // default hparams (LLaMA 7B)
@@ -261,12 +280,12 @@ static size_t checked_div(size_t a, size_t b) {
 }
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
     }
-    return ret;
+    return buf;
 }
 
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -616,6 +635,7 @@ struct llama_model_loader {
             throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
+
         return get_tensor_for(lt);
     }
 
@@ -898,13 +918,13 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size +
-            MEM_REQ_SCRATCH0.at(model.type) +
-            MEM_REQ_SCRATCH1.at(model.type) +
-            MEM_REQ_EVAL.at(model.type);
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF.at(model.type);
+            scale*MEM_REQ_KV_SELF().at(model.type);
 
         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +961,8 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
+        model.norm   = ml->get_tensor("norm.weight",   {n_embd});
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1569,7 +1589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);
 
+        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
@@ -1731,10 +1751,10 @@ struct llama_context * llama_init_from_file(
         ctx->embedding.resize(hparams.n_embd);
     }
 
-    ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-    ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+    ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
     return ctx;
@@ -1757,6 +1777,254 @@ int llama_model_quantize(
     }
 }
 
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    auto & model = ctx->model;
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    auto fin = std::ifstream(path_lora, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+        return 1;
+    }
+
+    // verify magic and version
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != 'ggla') {
+            fprintf(stderr, "%s: bad file magic\n", __func__);
+            return 1;
+        }
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r;
+    int32_t lora_alpha;
+    fin.read((char *) &lora_r, sizeof(lora_r));
+    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    float scaling = (float)lora_alpha / (float)lora_r;
+
+    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+    // create a temporary ggml context to store the lora tensors
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    struct ggml_init_params params;
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
+    params.no_alloc   = false;
+
+    ggml_context * lora_ctx = ggml_init(params);
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+    // create a name -> tensor map of the model to accelerate lookups
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (auto & kv: model.tensors_by_name) {
+        model_tensors.insert(kv);
+    }
+
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should in llama_model_loader
+        if (model_loader->use_mmap) {
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        }
+    }
+
+    // read tensors and apply
+    bool warned = false;
+    int n_tensors = 0;
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ftype;
+
+        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        if (fin.eof()) {
+            break;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+
+        std::string name(length, 0);
+        fin.read(&name[0], length);
+
+        // check for lora suffix and get the type of tensor
+        const std::string lora_suffix = ".lora";
+        size_t pos = name.rfind(lora_suffix);
+        if (pos == std::string::npos) {
+            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        std::string lora_type = name.substr(pos + lora_suffix.length());
+        std::string base_name = name;
+        base_name.erase(pos);
+        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+            return 1;
+        }
+
+        // create ggml tensor
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32;  break;
+            case 1: wtype = GGML_TYPE_F16;  break;
+            default:
+                    {
+                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+                                __func__, ftype);
+                        return false;
+                    }
+        }
+        ggml_tensor* lora_tensor;
+        if (n_dims == 2) {
+            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+        }
+        else {
+            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        // load tensor data
+        size_t offset = fin.tellg();
+        size_t tensor_data_size = ggml_nbytes(lora_tensor);
+        offset = (offset + 31) & -32;
+        fin.seekg(offset);
+        fin.read((char*)lora_tensor->data, tensor_data_size);
+
+        lora_tensors[name] = lora_tensor;
+
+        // check if we have both A and B tensors and apply
+        if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
+            if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+                if (!warned) {
+                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                            "use a f16 or f32 base model with --lora-base\n", __func__);
+                    warned = true;
+                }
+            }
+
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                        " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+                return 1;
+            }
+
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+            if (scaling != 1.0f) {
+                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+            }
+
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
+
+            struct ggml_cgraph gf = ggml_build_forward(r);
+            gf.n_threads = n_threads;
+            ggml_graph_compute(lora_ctx, &gf);
+
+            // we won't need these tensors again, reset the context to save memory
+            ggml_free(lora_ctx);
+            lora_ctx = ggml_init(params);
+            lora_tensors.clear();
+
+            n_tensors++;
+            if (n_tensors % 4 == 0)
+                fprintf(stderr, ".");
+        }
+    }
+
+    // TODO: this should be in a destructor, it will leak on failure
+    ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
@@ -1914,18 +2182,20 @@ const char * llama_print_system_info(void) {
     static std::string s;
 
     s  = "";
+    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
+    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
+    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
+    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
+    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
+    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
+    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
+    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
+    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
 
     return s.c_str();
 }
data/ext/llama_cpp/src/llama.h
CHANGED

@@ -96,6 +96,18 @@ extern "C" {
             const char * fname_out,
       enum llama_ftype   ftype);
 
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);
+
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
     LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
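For context, a minimal sketch of how an application might call the new entry point from the C API above; the model and adapter paths and the thread count are placeholders, and llama_context_default_params / llama_init_from_file / llama_free are the existing loader functions declared in llama.h:

#include "llama.h"
#include <cstdio>

int main() {
    struct llama_context_params params = llama_context_default_params();

    // placeholder paths for illustration
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // apply the adapter on a freshly loaded model; pass an f16/f32 checkpoint
    // as the base so quantized weights are not used as the LoRA base
    if (llama_apply_lora_from_file(ctx, "lora/ggml-adapter-model.bin",
                                   "models/7B/ggml-model-f16.bin", 4) != 0) {
        fprintf(stderr, "failed to apply LoRA adapter\n");
        llama_free(ctx);
        return 1;
    }

    llama_free(ctx);
    return 0;
}
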
data/ext/llama_cpp/src/llama_util.h
CHANGED

@@ -43,8 +43,12 @@
 } while (0)
 
 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);

@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-}
+}
 
 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap

@@ -164,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;

@@ -172,15 +176,16 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }
 
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
         }
     }

@@ -190,7 +195,7 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

@@ -212,13 +217,15 @@ struct llama_mmap {
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.4'
+  VERSION = '0.0.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
+  LLAMA_CPP_VERSION = 'master-315a95a'
 end
data/lib/llama_cpp.rb
CHANGED

@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
+    spaced_prompt = " #{prompt}"
 
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx

@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end
 
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
data/sig/llama_cpp.rbs
CHANGED

@@ -9,6 +9,8 @@ module LLaMACpp
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool
 
   class Context
     public

@@ -28,6 +30,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
   class ContextParams
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-04-
|
11
|
+
date: 2023-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|