llama_cpp 0.0.5 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/ext/llama_cpp/extconf.rb +24 -1
- data/ext/llama_cpp/llama_cpp.cpp +72 -0
- data/ext/llama_cpp/src/ggml-cuda.h +44 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +2324 -969
- data/ext/llama_cpp/src/ggml.h +656 -619
- data/ext/llama_cpp/src/llama.cpp +269 -42
- data/ext/llama_cpp/src/llama.h +22 -14
- data/ext/llama_cpp/src/llama_util.h +15 -3
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +26 -2
- metadata +6 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -24,6 +24,10 @@
|
|
24
24
|
#include <memory>
|
25
25
|
#include <algorithm>
|
26
26
|
#include <initializer_list>
|
27
|
+
#include <thread>
|
28
|
+
#include <atomic>
|
29
|
+
#include <mutex>
|
30
|
+
#include <sstream>
|
27
31
|
|
28
32
|
#define LLAMA_USE_SCRATCH
|
29
33
|
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
@@ -50,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|
50
54
|
{ MODEL_7B, 512ull * MB },
|
51
55
|
{ MODEL_13B, 512ull * MB },
|
52
56
|
{ MODEL_30B, 512ull * MB },
|
53
|
-
{ MODEL_65B,
|
57
|
+
{ MODEL_65B, 1024ull * MB },
|
54
58
|
};
|
55
59
|
return _MEM_REQ_SCRATCH0;
|
56
60
|
}
|
@@ -61,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
|
61
65
|
{ MODEL_7B, 512ull * MB },
|
62
66
|
{ MODEL_13B, 512ull * MB },
|
63
67
|
{ MODEL_30B, 512ull * MB },
|
64
|
-
{ MODEL_65B,
|
68
|
+
{ MODEL_65B, 1024ull * MB },
|
65
69
|
};
|
66
70
|
return _MEM_REQ_SCRATCH1;
|
67
|
-
}
|
71
|
+
}
|
68
72
|
|
69
73
|
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
70
74
|
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
@@ -76,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
|
76
80
|
{ MODEL_65B, 5120ull * MB },
|
77
81
|
};
|
78
82
|
return _MEM_REQ_KV_SELF;
|
79
|
-
}
|
83
|
+
}
|
80
84
|
|
81
85
|
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
82
86
|
// not actually needed if BLAS is disabled
|
@@ -89,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
|
89
93
|
{ MODEL_65B, 1536ull * MB },
|
90
94
|
};
|
91
95
|
return _MEM_REQ_EVAL;
|
92
|
-
}
|
96
|
+
}
|
93
97
|
|
94
98
|
// default hparams (LLaMA 7B)
|
95
99
|
struct llama_hparams {
|
@@ -478,6 +482,11 @@ struct llama_file_loader {
|
|
478
482
|
case GGML_TYPE_F16:
|
479
483
|
case GGML_TYPE_Q4_0:
|
480
484
|
case GGML_TYPE_Q4_1:
|
485
|
+
case GGML_TYPE_Q4_2:
|
486
|
+
case GGML_TYPE_Q4_3:
|
487
|
+
case GGML_TYPE_Q5_0:
|
488
|
+
case GGML_TYPE_Q5_1:
|
489
|
+
case GGML_TYPE_Q8_0:
|
481
490
|
break;
|
482
491
|
default: {
|
483
492
|
throw format("unrecognized tensor type %u\n", shard.type);
|
@@ -550,6 +559,11 @@ struct llama_file_saver {
|
|
550
559
|
case GGML_TYPE_F16:
|
551
560
|
case GGML_TYPE_Q4_0:
|
552
561
|
case GGML_TYPE_Q4_1:
|
562
|
+
case GGML_TYPE_Q4_2:
|
563
|
+
case GGML_TYPE_Q4_3:
|
564
|
+
case GGML_TYPE_Q5_0:
|
565
|
+
case GGML_TYPE_Q5_1:
|
566
|
+
case GGML_TYPE_Q8_0:
|
553
567
|
break;
|
554
568
|
default: LLAMA_ASSERT(false);
|
555
569
|
}
|
@@ -838,6 +852,11 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
|
838
852
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
839
853
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
840
854
|
return "mostly Q4_1, some F16";
|
855
|
+
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
|
856
|
+
case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
|
857
|
+
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
858
|
+
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
859
|
+
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
841
860
|
default: return "unknown, may not work";
|
842
861
|
}
|
843
862
|
}
|
@@ -1066,7 +1085,7 @@ static bool llama_eval_internal(
|
|
1066
1085
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
1067
1086
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
1068
1087
|
ggml_cgraph gf = {};
|
1069
|
-
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
|
1088
|
+
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
1070
1089
|
|
1071
1090
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
1072
1091
|
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
@@ -1240,9 +1259,11 @@ static bool llama_eval_internal(
|
|
1240
1259
|
ggml_build_forward_expand(&gf, inpL);
|
1241
1260
|
ggml_graph_compute (ctx0, &gf);
|
1242
1261
|
|
1262
|
+
#ifdef GGML_PERF
|
1243
1263
|
// print timing information per ggml operation (for debugging purposes)
|
1244
1264
|
// requires GGML_PERF to be defined
|
1245
|
-
|
1265
|
+
ggml_graph_print(&gf);
|
1266
|
+
#endif
|
1246
1267
|
|
1247
1268
|
// plot the computation graph in dot format (for debugging purposes)
|
1248
1269
|
//if (n_past%100 == 0) {
|
@@ -1566,14 +1587,23 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|
1566
1587
|
// quantization
|
1567
1588
|
//
|
1568
1589
|
|
1569
|
-
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
|
1590
|
+
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
|
1570
1591
|
ggml_type quantized_type;
|
1571
1592
|
switch (ftype) {
|
1572
1593
|
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
1573
1594
|
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
1595
|
+
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
|
1596
|
+
case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
|
1597
|
+
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
1598
|
+
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
1599
|
+
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
1574
1600
|
default: throw format("invalid output file type %d\n", ftype);
|
1575
1601
|
};
|
1576
1602
|
|
1603
|
+
if (nthread <= 0) {
|
1604
|
+
nthread = std::thread::hardware_concurrency();
|
1605
|
+
}
|
1606
|
+
|
1577
1607
|
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
1578
1608
|
/*vocab_only*/ false));
|
1579
1609
|
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
@@ -1582,6 +1612,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1582
1612
|
size_t total_size_new = 0;
|
1583
1613
|
std::vector<int64_t> hist_all(1 << 4, 0);
|
1584
1614
|
|
1615
|
+
std::vector<std::thread> workers;
|
1616
|
+
std::mutex mutex;
|
1617
|
+
|
1585
1618
|
size_t idx = 0;
|
1586
1619
|
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
1587
1620
|
llama_buffer read_data;
|
@@ -1600,6 +1633,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1600
1633
|
// quantize only 2D tensors
|
1601
1634
|
quantize &= (tensor.ne.size() == 2);
|
1602
1635
|
|
1636
|
+
// uncomment this to keep the output layer in FP16
|
1637
|
+
//if (tensor.name == "output.weight") {
|
1638
|
+
// quantize = false;
|
1639
|
+
//}
|
1640
|
+
|
1603
1641
|
enum ggml_type new_type;
|
1604
1642
|
void * new_data;
|
1605
1643
|
size_t new_size;
|
@@ -1635,17 +1673,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
1635
1673
|
new_data = work.addr;
|
1636
1674
|
std::vector<int64_t> hist_cur(1 << 4, 0);
|
1637
1675
|
|
1638
|
-
|
1639
|
-
|
1640
|
-
|
1641
|
-
|
1642
|
-
|
1643
|
-
|
1644
|
-
|
1645
|
-
|
1646
|
-
|
1647
|
-
|
1648
|
-
|
1676
|
+
int chunk_size = 32 * 512;
|
1677
|
+
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
1678
|
+
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
1679
|
+
if (nthread_use < 2) {
|
1680
|
+
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
|
1681
|
+
} else {
|
1682
|
+
size_t counter = 0;
|
1683
|
+
new_size = 0;
|
1684
|
+
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
|
1685
|
+
std::vector<int64_t> local_hist;
|
1686
|
+
size_t local_size = 0;
|
1687
|
+
while (true) {
|
1688
|
+
std::unique_lock<std::mutex> lock(mutex);
|
1689
|
+
size_t first = counter; counter += chunk_size;
|
1690
|
+
if (first >= nelements) {
|
1691
|
+
if (!local_hist.empty()) {
|
1692
|
+
for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
|
1693
|
+
new_size += local_size;
|
1694
|
+
}
|
1695
|
+
break;
|
1696
|
+
}
|
1697
|
+
lock.unlock();
|
1698
|
+
size_t last = std::min(nelements, first + chunk_size);
|
1699
|
+
if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
|
1700
|
+
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
1701
|
+
}
|
1702
|
+
};
|
1703
|
+
if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
|
1704
|
+
for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
|
1705
|
+
compute();
|
1706
|
+
for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
|
1649
1707
|
}
|
1650
1708
|
|
1651
1709
|
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
@@ -1744,7 +1802,7 @@ struct llama_context * llama_init_from_file(
|
|
1744
1802
|
if (params.logits_all) {
|
1745
1803
|
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
|
1746
1804
|
} else {
|
1747
|
-
ctx->logits.reserve(hparams.
|
1805
|
+
ctx->logits.reserve(hparams.n_vocab);
|
1748
1806
|
}
|
1749
1807
|
|
1750
1808
|
if (params.embedding){
|
@@ -1767,9 +1825,10 @@ void llama_free(struct llama_context * ctx) {
|
|
1767
1825
|
int llama_model_quantize(
|
1768
1826
|
const char * fname_inp,
|
1769
1827
|
const char * fname_out,
|
1770
|
-
enum llama_ftype ftype
|
1828
|
+
enum llama_ftype ftype,
|
1829
|
+
int nthread) {
|
1771
1830
|
try {
|
1772
|
-
llama_model_quantize_internal(fname_inp, fname_out, ftype);
|
1831
|
+
llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
|
1773
1832
|
return 0;
|
1774
1833
|
} catch (const std::string & err) {
|
1775
1834
|
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
@@ -1955,7 +2014,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
1955
2014
|
base_t = dest_t;
|
1956
2015
|
}
|
1957
2016
|
|
1958
|
-
if (base_t->type
|
2017
|
+
if (ggml_is_quantized(base_t->type)) {
|
1959
2018
|
if (!warned) {
|
1960
2019
|
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
1961
2020
|
"use a f16 or f32 base model with --lora-base\n", __func__);
|
@@ -2025,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
|
|
2025
2084
|
}
|
2026
2085
|
}
|
2027
2086
|
|
2028
|
-
|
2029
|
-
|
2030
|
-
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
2031
|
-
return ctx->model.kv_self.buf.addr;
|
2087
|
+
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
2088
|
+
return ctx->model.kv_self.n;
|
2032
2089
|
}
|
2033
2090
|
|
2034
|
-
|
2035
|
-
|
2036
|
-
|
2091
|
+
#define LLAMA_MAX_RNG_STATE 64*1024
|
2092
|
+
|
2093
|
+
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
|
2094
|
+
if (seed <= 0) {
|
2095
|
+
seed = time(NULL);
|
2096
|
+
}
|
2097
|
+
ctx->rng.seed(seed);
|
2037
2098
|
}
|
2038
2099
|
|
2039
|
-
|
2040
|
-
|
2100
|
+
// Returns the size of the state
|
2101
|
+
size_t llama_get_state_size(struct llama_context * ctx) {
|
2102
|
+
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
2103
|
+
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
2104
|
+
const size_t s_rng_size = sizeof(size_t);
|
2105
|
+
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
2106
|
+
const size_t s_logits_capacity = sizeof(size_t);
|
2107
|
+
const size_t s_logits_size = sizeof(size_t);
|
2108
|
+
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
|
2109
|
+
const size_t s_embedding_size = sizeof(size_t);
|
2110
|
+
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
2111
|
+
const size_t s_kv_size = sizeof(size_t);
|
2112
|
+
const size_t s_kv_ntok = sizeof(int);
|
2113
|
+
const size_t s_kv = ctx->model.kv_self.buf.size;
|
2114
|
+
|
2115
|
+
const size_t s_total = (
|
2116
|
+
+ s_rng_size
|
2117
|
+
+ s_rng
|
2118
|
+
+ s_logits_capacity
|
2119
|
+
+ s_logits_size
|
2120
|
+
+ s_logits
|
2121
|
+
+ s_embedding_size
|
2122
|
+
+ s_embedding
|
2123
|
+
+ s_kv_size
|
2124
|
+
+ s_kv_ntok
|
2125
|
+
+ s_kv
|
2126
|
+
);
|
2127
|
+
|
2128
|
+
return s_total;
|
2041
2129
|
}
|
2042
2130
|
|
2043
|
-
//
|
2044
|
-
|
2045
|
-
|
2046
|
-
|
2047
|
-
|
2048
|
-
|
2049
|
-
|
2050
|
-
|
2051
|
-
|
2052
|
-
|
2131
|
+
// Copies the state to the specified destination address
|
2132
|
+
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
2133
|
+
uint8_t * out = dest;
|
2134
|
+
|
2135
|
+
// copy rng
|
2136
|
+
{
|
2137
|
+
std::stringstream rng_ss;
|
2138
|
+
rng_ss << ctx->rng;
|
2139
|
+
|
2140
|
+
const size_t rng_size = rng_ss.str().size();
|
2141
|
+
char rng_buf[LLAMA_MAX_RNG_STATE];
|
2142
|
+
|
2143
|
+
memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
|
2144
|
+
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
2145
|
+
|
2146
|
+
memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
|
2147
|
+
memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
|
2148
|
+
}
|
2149
|
+
|
2150
|
+
// copy logits
|
2151
|
+
{
|
2152
|
+
const size_t logits_cap = ctx->logits.capacity();
|
2153
|
+
const size_t logits_size = ctx->logits.size();
|
2154
|
+
|
2155
|
+
memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
|
2156
|
+
memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
|
2157
|
+
|
2158
|
+
if (logits_size) {
|
2159
|
+
memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
|
2160
|
+
}
|
2161
|
+
|
2162
|
+
out += logits_cap * sizeof(float);
|
2163
|
+
}
|
2164
|
+
|
2165
|
+
// copy embeddings
|
2166
|
+
{
|
2167
|
+
const size_t embedding_size = ctx->embedding.size();
|
2168
|
+
|
2169
|
+
memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
|
2170
|
+
|
2171
|
+
if (embedding_size) {
|
2172
|
+
memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
|
2173
|
+
out += embedding_size * sizeof(float);
|
2174
|
+
}
|
2175
|
+
}
|
2176
|
+
|
2177
|
+
// copy kv cache
|
2178
|
+
{
|
2179
|
+
const size_t kv_size = ctx->model.kv_self.buf.size;
|
2180
|
+
const int kv_ntok = llama_get_kv_cache_token_count(ctx);
|
2181
|
+
|
2182
|
+
memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
|
2183
|
+
memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
|
2184
|
+
|
2185
|
+
if (kv_size) {
|
2186
|
+
memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
|
2187
|
+
}
|
2188
|
+
}
|
2189
|
+
|
2190
|
+
const size_t written = out - dest;
|
2191
|
+
const size_t expected = llama_get_state_size(ctx);
|
2192
|
+
|
2193
|
+
LLAMA_ASSERT(written == expected);
|
2194
|
+
|
2195
|
+
return written;
|
2196
|
+
}
|
2197
|
+
|
2198
|
+
// Sets the state reading from the specified source address
|
2199
|
+
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
2200
|
+
const uint8_t * in = src;
|
2201
|
+
|
2202
|
+
// set rng
|
2203
|
+
{
|
2204
|
+
size_t rng_size;
|
2205
|
+
char rng_buf[LLAMA_MAX_RNG_STATE];
|
2206
|
+
|
2207
|
+
memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
|
2208
|
+
memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
|
2209
|
+
|
2210
|
+
std::stringstream rng_ss;
|
2211
|
+
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
2212
|
+
rng_ss >> ctx->rng;
|
2213
|
+
|
2214
|
+
LLAMA_ASSERT(rng_ss.fail() == false);
|
2215
|
+
}
|
2216
|
+
|
2217
|
+
// set logits
|
2218
|
+
{
|
2219
|
+
size_t logits_cap;
|
2220
|
+
size_t logits_size;
|
2221
|
+
|
2222
|
+
memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
|
2223
|
+
memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
|
2224
|
+
|
2225
|
+
LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
|
2226
|
+
|
2227
|
+
if (logits_size) {
|
2228
|
+
ctx->logits.resize(logits_size);
|
2229
|
+
memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
|
2230
|
+
}
|
2231
|
+
|
2232
|
+
in += logits_cap * sizeof(float);
|
2233
|
+
}
|
2234
|
+
|
2235
|
+
// set embeddings
|
2236
|
+
{
|
2237
|
+
size_t embedding_size;
|
2238
|
+
|
2239
|
+
memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
|
2240
|
+
|
2241
|
+
LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
|
2242
|
+
|
2243
|
+
if (embedding_size) {
|
2244
|
+
memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
|
2245
|
+
in += embedding_size * sizeof(float);
|
2246
|
+
}
|
2247
|
+
}
|
2248
|
+
|
2249
|
+
// set kv cache
|
2250
|
+
{
|
2251
|
+
size_t kv_size;
|
2252
|
+
int kv_ntok;
|
2253
|
+
|
2254
|
+
memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
|
2255
|
+
memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
|
2256
|
+
|
2257
|
+
if (kv_size) {
|
2258
|
+
LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
|
2259
|
+
|
2260
|
+
void * k_data = ctx->model.kv_self.k->data; // remember data pointers
|
2261
|
+
void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
|
2262
|
+
|
2263
|
+
memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
|
2264
|
+
|
2265
|
+
ctx->model.kv_self.k->data = k_data; // restore correct data pointers
|
2266
|
+
ctx->model.kv_self.v->data = v_data;
|
2267
|
+
|
2268
|
+
}
|
2269
|
+
|
2270
|
+
ctx->model.kv_self.n = kv_ntok;
|
2271
|
+
}
|
2272
|
+
|
2273
|
+
const size_t nread = in - src;
|
2274
|
+
const size_t expected = llama_get_state_size(ctx);
|
2275
|
+
|
2276
|
+
LLAMA_ASSERT(nread == expected);
|
2277
|
+
|
2278
|
+
return nread;
|
2053
2279
|
}
|
2054
2280
|
|
2055
2281
|
int llama_eval(
|
@@ -2204,3 +2430,4 @@ const char * llama_print_system_info(void) {
|
|
2204
2430
|
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
2205
2431
|
return ctx->model.tensors_by_name;
|
2206
2432
|
}
|
2433
|
+
|
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -72,6 +72,11 @@ extern "C" {
|
|
72
72
|
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
73
73
|
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
74
74
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
75
|
+
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
76
|
+
LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
|
77
|
+
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
78
|
+
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
79
|
+
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
75
80
|
};
|
76
81
|
|
77
82
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
@@ -91,10 +96,12 @@ extern "C" {
|
|
91
96
|
|
92
97
|
// TODO: not great API - very likely to change
|
93
98
|
// Returns 0 on success
|
99
|
+
// nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
|
94
100
|
LLAMA_API int llama_model_quantize(
|
95
101
|
const char * fname_inp,
|
96
102
|
const char * fname_out,
|
97
|
-
enum llama_ftype ftype
|
103
|
+
enum llama_ftype ftype,
|
104
|
+
int nthread);
|
98
105
|
|
99
106
|
// Apply a LoRA adapter to a loaded model
|
100
107
|
// path_base_model is the path to a higher quality model to use as a base for
|
@@ -108,22 +115,23 @@ extern "C" {
|
|
108
115
|
const char * path_base_model,
|
109
116
|
int n_threads);
|
110
117
|
|
111
|
-
// Returns the KV cache that will contain the context for the
|
112
|
-
// ongoing prediction with the model.
|
113
|
-
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
114
|
-
|
115
|
-
// Returns the size of the KV cache
|
116
|
-
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
|
117
|
-
|
118
118
|
// Returns the number of tokens in the KV cache
|
119
119
|
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
120
120
|
|
121
|
-
// Sets the
|
122
|
-
LLAMA_API void
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
121
|
+
// Sets the current rng seed.
|
122
|
+
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
|
123
|
+
|
124
|
+
// Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
|
125
|
+
LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
|
126
|
+
|
127
|
+
// Copies the state to the specified destination address.
|
128
|
+
// Destination needs to have allocated enough memory.
|
129
|
+
// Returns the number of bytes copied
|
130
|
+
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
|
131
|
+
|
132
|
+
// Set the state reading from the specified address
|
133
|
+
// Returns the number of bytes read
|
134
|
+
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
|
127
135
|
|
128
136
|
// Run the llama inference to obtain the logits and probabilities for the next token.
|
129
137
|
// tokens + n_tokens is the provided batch of new tokens to process
|
@@ -21,6 +21,9 @@
|
|
21
21
|
#if defined(_POSIX_MAPPED_FILES)
|
22
22
|
#include <sys/mman.h>
|
23
23
|
#endif
|
24
|
+
#if defined(_POSIX_MEMLOCK_RANGE)
|
25
|
+
#include <sys/resource.h>
|
26
|
+
#endif
|
24
27
|
#endif
|
25
28
|
#endif
|
26
29
|
|
@@ -202,7 +205,6 @@ struct llama_mmap {
|
|
202
205
|
|
203
206
|
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
204
207
|
DWORD error = GetLastError();
|
205
|
-
CloseHandle(hFile);
|
206
208
|
|
207
209
|
if (hMapping == NULL) {
|
208
210
|
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
@@ -304,8 +306,18 @@ struct llama_mlock {
|
|
304
306
|
if (!mlock(addr, size)) {
|
305
307
|
return true;
|
306
308
|
} else {
|
307
|
-
|
308
|
-
|
309
|
+
char* errmsg = std::strerror(errno);
|
310
|
+
bool suggest = (errno == ENOMEM);
|
311
|
+
|
312
|
+
// Check if the resource limit is fine after all
|
313
|
+
struct rlimit lock_limit;
|
314
|
+
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
315
|
+
suggest = false;
|
316
|
+
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
317
|
+
suggest = false;
|
318
|
+
|
319
|
+
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
320
|
+
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
309
321
|
return false;
|
310
322
|
}
|
311
323
|
}
|