llama_cpp 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/ext/llama_cpp/extconf.rb +24 -1
- data/ext/llama_cpp/llama_cpp.cpp +72 -0
- data/ext/llama_cpp/src/ggml-cuda.h +44 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +2324 -969
- data/ext/llama_cpp/src/ggml.h +656 -619
- data/ext/llama_cpp/src/llama.cpp +269 -42
- data/ext/llama_cpp/src/llama.h +22 -14
- data/ext/llama_cpp/src/llama_util.h +15 -3
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +26 -2
- metadata +6 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -24,6 +24,10 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
+#include <sstream>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -50,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B,  512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -61,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B,  512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
-}
+}
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -76,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_65B, 5120ull * MB },
     };
     return _MEM_REQ_KV_SELF;
-}
+}
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
@@ -89,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_65B, 1536ull * MB },
     };
     return _MEM_REQ_EVAL;
-}
+}
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
@@ -478,6 +482,11 @@ struct llama_file_loader {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +559,11 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -838,6 +852,11 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
         default:                      return "unknown, may not work";
     }
 }
@@ -1066,7 +1085,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1240,9 +1259,11 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+#ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    //ggml_graph_print(&gf);
+    ggml_graph_print(&gf);
+#endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
@@ -1566,14 +1587,23 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1582,6 +1612,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1600,6 +1633,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
+        // uncomment this to keep the output layer in FP16
+        //if (tensor.name == "output.weight") {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -1635,17 +1673,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            … (11 removed lines not shown in this view)
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
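The new worker loop parcels each tensor into fixed-size chunks behind a mutex-protected counter, so threads pick up work dynamically instead of being assigned static slices. Below is a minimal standalone sketch of that dispatch pattern, not the library's own code; `process_chunk()` and `n_elements` are hypothetical stand-ins for `ggml_quantize_chunk()` and the tensor size.

```cpp
// Standalone sketch of the chunked work-dispatch pattern used above.
// process_chunk() and n_elements are hypothetical placeholders.
#include <algorithm>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

static void process_chunk(size_t first, size_t count) {
    (void) first; (void) count; // stand-in for real per-chunk work
}

int main() {
    const size_t n_elements = 1000000;
    const size_t chunk_size = 32 * 512;
    const int    n_threads  = 4;

    std::mutex mutex;
    size_t counter = 0;

    auto worker = [&]() {
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex); // only the counter is shared
                first = counter;
                counter += chunk_size;
            }
            if (first >= n_elements) break;
            const size_t last = std::min(n_elements, first + chunk_size);
            process_chunk(first, last - first);          // work happens outside the lock
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < n_threads - 1; ++i) workers.emplace_back(worker);
    worker();                                            // the calling thread also works
    for (auto & t : workers) t.join();

    std::printf("done\n");
    return 0;
}
```

Keeping the critical section down to a single counter increment is what lets the per-chunk quantization itself run fully in parallel.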
@@ -1744,7 +1802,7 @@ struct llama_context * llama_init_from_file(
         if (params.logits_all) {
             ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
         } else {
-            ctx->logits.reserve(hparams.n_ctx);
+            ctx->logits.reserve(hparams.n_vocab);
         }
 
         if (params.embedding){
@@ -1767,9 +1825,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
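For reference, a minimal sketch of how a caller might use the extended entry point. The model paths are hypothetical placeholders; the ftype constant and the `nthread` semantics come straight from the diff above (a value of 0 or less falls back to `std::thread::hardware_concurrency()`).

```cpp
// Hypothetical usage sketch of the new llama_model_quantize() signature.
#include "llama.h"

int main() {
    const int ret = llama_model_quantize(
        "models/7B/ggml-model-f16.bin",   // hypothetical input path (F16 model)
        "models/7B/ggml-model-q5_1.bin",  // hypothetical output path
        LLAMA_FTYPE_MOSTLY_Q5_1,          // target file type
        0);                               // <= 0 => use all hardware threads
    return ret;                           // 0 on success
}
```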
@@ -1955,7 +2014,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         base_t = dest_t;
     }
 
-    if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+    if (ggml_is_quantized(base_t->type)) {
         if (!warned) {
             fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                             "use a f16 or f32 base model with --lora-base\n", __func__);
@@ -2025,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-… (2 removed lines not shown in this view)
-const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.addr;
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
 }
 
-… (3 removed lines not shown in this view)
+#define LLAMA_MAX_RNG_STATE 64*1024
+
+void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+    if (seed <= 0) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
 }
 
-… (2 removed lines not shown in this view)
+// Returns the size of the state
+size_t llama_get_state_size(struct llama_context * ctx) {
+    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+    // for reference, std::mt19937(1337) serializes to 6701 bytes.
+    const size_t s_rng_size        = sizeof(size_t);
+    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
+    const size_t s_logits_capacity = sizeof(size_t);
+    const size_t s_logits_size     = sizeof(size_t);
+    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
+    const size_t s_embedding_size  = sizeof(size_t);
+    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
+    const size_t s_kv_size         = sizeof(size_t);
+    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_kv              = ctx->model.kv_self.buf.size;
+
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_logits_capacity
+        + s_logits_size
+        + s_logits
+        + s_embedding_size
+        + s_embedding
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+
+    return s_total;
 }
 
-//
-… (9 removed lines not shown in this view)
+// Copies the state to the specified destination address
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+    uint8_t * out = dest;
+
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << ctx->rng;
+
+        const size_t rng_size = rng_ss.str().size();
+        char rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+        memcpy(out, &rng_size,   sizeof(rng_size));    out += sizeof(rng_size);
+        memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+    }
+
+    // copy logits
+    {
+        const size_t logits_cap  = ctx->logits.capacity();
+        const size_t logits_size = ctx->logits.size();
+
+        memcpy(out, &logits_cap,  sizeof(logits_cap));  out += sizeof(logits_cap);
+        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+        if (logits_size) {
+            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+        }
+
+        out += logits_cap * sizeof(float);
+    }
+
+    // copy embeddings
+    {
+        const size_t embedding_size = ctx->embedding.size();
+
+        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+        if (embedding_size) {
+            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+            out += embedding_size * sizeof(float);
+        }
+    }
+
+    // copy kv cache
+    {
+        const size_t kv_size = ctx->model.kv_self.buf.size;
+        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+        }
+    }
+
+    const size_t written  = out - dest;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(written == expected);
+
+    return written;
+}
+
+// Sets the state reading from the specified source address
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * in = src;
+
+    // set rng
+    {
+        size_t rng_size;
+        char   rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
+        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+        std::stringstream rng_ss;
+        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        rng_ss >> ctx->rng;
+
+        LLAMA_ASSERT(rng_ss.fail() == false);
+    }
+
+    // set logits
+    {
+        size_t logits_cap;
+        size_t logits_size;
+
+        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
+        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+        if (logits_size) {
+            ctx->logits.resize(logits_size);
+            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+        }
+
+        in += logits_cap * sizeof(float);
+    }
+
+    // set embeddings
+    {
+        size_t embedding_size;
+
+        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+        if (embedding_size) {
+            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+            in += embedding_size * sizeof(float);
+        }
+    }
+
+    // set kv cache
+    {
+        size_t kv_size;
+        int    kv_ntok;
+
+        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+            ctx->model.kv_self.v->data = v_data;
+
+        }
+
+        ctx->model.kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(nread == expected);
+
+    return nread;
 }
 
 int llama_eval(
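Taken together, these functions let a caller snapshot a context (rng, logits, embeddings and KV cache) into one flat buffer and restore it later. A hedged sketch of a round trip, assuming `ctx` is an already initialized `llama_context *` and the helper name is my own:

```cpp
// Hypothetical save/restore round trip using the new state API.
#include <cstdint>
#include <cstdio>
#include <vector>
#include "llama.h"

void snapshot_and_restore(struct llama_context * ctx) {
    // size the buffer first; the rng portion is deliberately over-reserved
    const size_t n_state = llama_get_state_size(ctx);
    std::vector<uint8_t> state(n_state);

    const size_t written = llama_copy_state_data(ctx, state.data());
    std::printf("saved %zu of %zu bytes\n", written, n_state);

    // ... evaluate more tokens here, then roll the context back ...

    const size_t read = llama_set_state_data(ctx, state.data());
    std::printf("restored %zu bytes\n", read);
}
```

Because the buffer embeds its own sizes for each section, the same bytes can also be written to disk and loaded into a freshly created context with matching parameters.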
@@ -2204,3 +2430,4 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
+
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -72,6 +72,11 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +96,12 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
@@ -108,22 +115,23 @@ extern "C" {
             const char * path_base_model,
             int n_threads);
 
-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
 
-    // Sets the …
-    LLAMA_API void …
-    … (4 removed lines not shown in this view)
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
data/ext/llama_cpp/src/llama_util.h
CHANGED
@@ -21,6 +21,9 @@
 #if defined(_POSIX_MAPPED_FILES)
 #include <sys/mman.h>
 #endif
+#if defined(_POSIX_MEMLOCK_RANGE)
+#include <sys/resource.h>
+#endif
 #endif
 #endif
 
@@ -202,7 +205,6 @@ struct llama_mmap {
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);
 
         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -304,8 +306,18 @@ struct llama_mlock {
         if (!mlock(addr, size)) {
             return true;
         } else {
-            … (2 removed lines not shown in this view)
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
             return false;
         }
     }
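The new error path only points the user at MLOCK_SUGGESTION when RLIMIT_MEMLOCK is plausibly the culprit (ENOMEM and a soft limit that really is too small). A small standalone sketch of that same check, assuming a POSIX system with <sys/resource.h>; this is illustrative, not part of llama_util.h:

```cpp
// Standalone sketch: inspect RLIMIT_MEMLOCK before attempting mlock().
#include <cstdio>
#include <sys/resource.h>

int main() {
    struct rlimit lock_limit;
    if (getrlimit(RLIMIT_MEMLOCK, &lock_limit) != 0) {
        std::perror("getrlimit(RLIMIT_MEMLOCK)");
        return 1;
    }
    std::printf("RLIMIT_MEMLOCK: soft=%llu hard=%llu\n",
                (unsigned long long) lock_limit.rlim_cur,
                (unsigned long long) lock_limit.rlim_max);
    // mlock() of a buffer larger than the soft limit fails with ENOMEM,
    // which is the case the diff above checks before printing the suggestion.
    return 0;
}
```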