llama_cpp 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +3 -2
- data/ext/llama_cpp/extconf.rb +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/ext/llama_cpp/src/ggml.c +1108 -508
- data/ext/llama_cpp/src/ggml.h +10 -0
- data/ext/llama_cpp/src/llama.cpp +317 -47
- data/ext/llama_cpp/src/llama.h +12 -0
- data/ext/llama_cpp/src/llama_util.h +22 -15
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -204,6 +204,7 @@ enum ggml_type {
     GGML_TYPE_F16 = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q8_0 = 4,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@@ -429,6 +430,12 @@ struct ggml_tensor * ggml_add(
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
+
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
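The new ggml_add_inplace has the same signature as ggml_add but accumulates into a's buffer instead of allocating a fresh result tensor; the LoRA code added to llama.cpp below relies on it to patch weight matrices without doubling memory. A minimal sketch of the difference (tensor names are illustrative, not from the source):

    // a and b are previously built f32 tensors of the same shape in ctx
    struct ggml_tensor * sum1 = ggml_add(ctx, a, b);         // result lives in a new tensor
    struct ggml_tensor * sum2 = ggml_add_inplace(ctx, a, b); // result reuses a's storage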
@@ -807,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_avx512_vbmi(void);
+int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
@@ -836,6 +845,7 @@ typedef struct {
     dequantize_row_q_t dequantize_row_q;
     quantize_row_q_t   quantize_row_q;
     quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
     vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;
 
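quantize_fns_t gains a fourth entry, quantize_row_q_dot, next to the reference and fast quantizers; paired with the new GGML_TYPE_Q8_0 above, it is the routine used to quantize the activation row on the dot-product side while the weights stay Q4_x. A hedged sketch of a table lookup (ggml of this vintage exposes the table through ggml_internal_get_quantize_fn; treat that exact name as an assumption):

    quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
    float row[256];           // one row of k = 256 activations, illustrative values
    unsigned char qrow[512];  // generous scratch for the quantized output blocks
    fns.quantize_row_q_dot(row, qrow, 256);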
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif
 
 #include "llama_util.h"
@@ -9,6 +11,7 @@
 #include "ggml.h"
 
 #include <array>
+#include <ctime>
 #include <cinttypes>
 #include <fstream>
 #include <random>
@@ -41,35 +44,51 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_7B,    512ull * MB },
-    { MODEL_13B,   512ull * MB },
-    { MODEL_30B,   512ull * MB },
-    { MODEL_65B,   512ull * MB },
-};
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH0;
+}
 
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_7B,    512ull * MB },
-    { MODEL_13B,   512ull * MB },
-    { MODEL_30B,   512ull * MB },
-    { MODEL_65B,   512ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+{
+    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+        { MODEL_7B,    512ull * MB },
+        { MODEL_13B,   512ull * MB },
+        { MODEL_30B,   512ull * MB },
+        { MODEL_65B,   512ull * MB },
+    };
+    return _MEM_REQ_SCRATCH1;
 };
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_7B,   1026ull * MB },
-    { MODEL_13B,  1608ull * MB },
-    { MODEL_30B,  3124ull * MB },
-    { MODEL_65B,  5120ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+{
+    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+        { MODEL_7B,   1026ull * MB },
+        { MODEL_13B,  1608ull * MB },
+        { MODEL_30B,  3124ull * MB },
+        { MODEL_65B,  5120ull * MB },
+    };
+    return _MEM_REQ_KV_SELF;
 };
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,   768ull * MB },
-    { MODEL_13B, 1024ull * MB },
-    { MODEL_30B, 1280ull * MB },
-    { MODEL_65B, 1536ull * MB },
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+{
+    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+        { MODEL_7B,   768ull * MB },
+        { MODEL_13B, 1024ull * MB },
+        { MODEL_30B, 1280ull * MB },
+        { MODEL_65B, 1536ull * MB },
+    };
+    return _MEM_REQ_EVAL;
 };
 
 // default hparams (LLaMA 7B)
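Turning the MEM_REQ_* globals into functions that return a function-local static is the construct-on-first-use idiom: a namespace-scope std::map can be read by another translation unit's static initializer before its own constructor has run (the static initialization order fiasco), while a function-local static is guaranteed to be built on first call. The pattern in isolation (a sketch, not the project's code):

    #include <cstddef>
    #include <map>

    // built the first time it is called, so no cross-TU ordering hazard
    static const std::map<int, std::size_t> & mem_req() {
        static std::map<int, std::size_t> m = { { 7, 512 }, { 13, 1024 } };
        return m;
    }

    std::size_t lookup(int model) { return mem_req().at(model); }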
@@ -261,12 +280,12 @@ static size_t checked_div(size_t a, size_t b) {
 }
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
-    std::string ret = "[" + std::to_string(ne.at(0));
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
-        ret += " x " + std::to_string(ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
     }
-    ret += "]";
-    return ret;
+    return buf;
 }
 
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -616,6 +635,7 @@ struct llama_model_loader {
             throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
+
         return get_tensor_for(lt);
     }
 
@@ -898,13 +918,13 @@ static void llama_model_load_internal(
     const size_t mem_required =
         ctx_size +
         mmapped_size +
-        MEM_REQ_SCRATCH0.at(model.type) +
-        MEM_REQ_SCRATCH1.at(model.type) +
-        MEM_REQ_EVAL.at(model.type);
+        MEM_REQ_SCRATCH0().at(model.type) +
+        MEM_REQ_SCRATCH1().at(model.type) +
+        MEM_REQ_EVAL().at(model.type);
 
     // this is the memory required by one llama_state
     const size_t mem_required_state =
-        scale*MEM_REQ_KV_SELF.at(model.type);
+        scale*MEM_REQ_KV_SELF().at(model.type);
 
     fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -941,8 +961,8 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
-        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});
+        model.norm   = ml->get_tensor("norm.weight",   {n_embd});
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1569,7 +1589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);
 
-        printf("[%zu/%zu] %36s - %16s, type = %6s, ",
+        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
@@ -1731,10 +1751,10 @@ struct llama_context * llama_init_from_file(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
     return ctx;
@@ -1757,6 +1777,254 @@ int llama_model_quantize(
     }
 }
 
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    auto & model = ctx->model;
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    auto fin = std::ifstream(path_lora, std::ios::binary);
+    if (!fin) {
+        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+        return 1;
+    }
+
+    // verify magic and version
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != 'ggla') {
+            fprintf(stderr, "%s: bad file magic\n", __func__);
+            return 1;
+        }
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r;
+    int32_t lora_alpha;
+    fin.read((char *) &lora_r, sizeof(lora_r));
+    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    float scaling = (float)lora_alpha / (float)lora_r;
+
+    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+    // create a temporary ggml context to store the lora tensors
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    struct ggml_init_params params;
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
+    params.no_alloc   = false;
+
+    ggml_context * lora_ctx = ggml_init(params);
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+    // create a name -> tensor map of the model to accelerate lookups
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (auto & kv: model.tensors_by_name) {
+        model_tensors.insert(kv);
+    }
+
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should in llama_model_loader
+        if (model_loader->use_mmap) {
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        }
+    }
+
+    // read tensors and apply
+    bool warned = false;
+    int n_tensors = 0;
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ftype;
+
+        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+        if (fin.eof()) {
+            break;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+
+        std::string name(length, 0);
+        fin.read(&name[0], length);
+
+        // check for lora suffix and get the type of tensor
+        const std::string lora_suffix = ".lora";
+        size_t pos = name.rfind(lora_suffix);
+        if (pos == std::string::npos) {
+            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        std::string lora_type = name.substr(pos + lora_suffix.length());
+        std::string base_name = name;
+        base_name.erase(pos);
+        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+
+        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+            return 1;
+        }
+
+        // create ggml tensor
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32; break;
+            case 1: wtype = GGML_TYPE_F16; break;
+            default:
+                    {
+                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+                                __func__, ftype);
+                        return false;
+                    }
+        }
+        ggml_tensor* lora_tensor;
+        if (n_dims == 2) {
+            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
+        }
+        else {
+            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        // load tensor data
+        size_t offset = fin.tellg();
+        size_t tensor_data_size = ggml_nbytes(lora_tensor);
+        offset = (offset + 31) & -32;
+        fin.seekg(offset);
+        fin.read((char*)lora_tensor->data, tensor_data_size);
+
+        lora_tensors[name] = lora_tensor;
+
+        // check if we have both A and B tensors and apply
+        if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
+            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
+            if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+                if (!warned) {
+                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                                    "use a f16 or f32 base model with --lora-base\n", __func__);
+                    warned = true;
+                }
+            }
+
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                                " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+                return 1;
+            }
+
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+
+            if (scaling != 1.0f) {
+                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+            }
+
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
+
+            struct ggml_cgraph gf = ggml_build_forward(r);
+            gf.n_threads = n_threads;
+            ggml_graph_compute(lora_ctx, &gf);
+
+            // we won't need these tensors again, reset the context to save memory
+            ggml_free(lora_ctx);
+            lora_ctx = ggml_init(params);
+            lora_tensors.clear();
+
+            n_tensors++;
+            if (n_tensors % 4 == 0)
+                fprintf(stderr, ".");
+        }
+    }
+
+    // TODO: this should be in a destructor, it will leak on failure
+    ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
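The heart of the new function is the `// w = w + BA*s` step: a LoRA adapter stores each patched matrix as two low-rank factors, and applying it computes W' = W + s*(B*A) with s = lora_alpha / lora_r, using ggml_add_inplace when the target and base tensors are the same. The `offset = (offset + 31) & -32;` line rounds the read position up to the next 32-byte boundary before the tensor payload; a worked illustration (the value 100 is made up):

    size_t offset = 100;          // position in the file after the tensor header
    offset = (offset + 31) & -32; // -32 == ~31, so the low 5 bits clear: 131 & ~31 == 128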
@@ -1914,18 +2182,20 @@ const char * llama_print_system_info(void) {
     static std::string s;
 
     s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
+    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
+    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
+    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
+    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
+    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
+    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
+    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
+    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
+    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
+    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
+    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
 
     return s.c_str();
 }
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -96,6 +96,18 @@ extern "C" {
             const char * fname_out,
             enum llama_ftype   ftype);
 
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);
+
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
     LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
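A hedged sketch of calling the new entry point from C++, assuming a context created with llama_init_from_file (the file paths and thread count are illustrative):

    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    // pass an f16 base model so the adapter is not applied on top of quantized weights
    if (llama_apply_lora_from_file(ctx, "adapter/ggml-lora.bin", "models/7B/ggml-model-f16.bin", 4) != 0) {
        fprintf(stderr, "failed to apply LoRA adapter\n");
    }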
data/ext/llama_cpp/src/llama_util.h
CHANGED
@@ -43,8 +43,12 @@
 } while (0)
 
 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-};
+}
 
 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }
 
-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
         }
     }
 
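The new prefetch flag exists for the LoRA path above: the base model is mapped with `/* prefetch */ false` because only the tensors the adapter touches will be read, so faulting the whole file in with MADV_WILLNEED would be wasted I/O. The shape of the pattern in isolation (a sketch, not the exact constructor):

    void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
    if (addr != MAP_FAILED && want_prefetch) {
        // advisory only: a failure here costs performance, never correctness
        madvise(addr, len, MADV_WILLNEED);
    }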
@@ -190,7 +195,7 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -212,13 +217,15 @@ struct llama_mmap {
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.4'
+  VERSION = '0.0.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-315a95a'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-    prompt.insert(0, ' ')
+    spaced_prompt = " #{prompt}"
 
-    embd_input = context.tokenize(text: prompt, add_bos: true)
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end
 
-    output.join.delete_prefix(prompt).strip
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -9,6 +9,8 @@ module LLaMACpp
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool
 
   class Context
     public
@@ -28,6 +30,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
   class ContextParams
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-
+date: 2023-04-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: