llama_cpp 0.0.6 → 0.0.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +26 -0
- data/ext/llama_cpp/src/ggml-cuda.h +32 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1436 -624
- data/ext/llama_cpp/src/ggml.h +654 -627
- data/ext/llama_cpp/src/llama.cpp +212 -29
- data/ext/llama_cpp/src/llama.h +17 -13
- data/ext/llama_cpp/src/llama_util.h +15 -2
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +16 -1
- metadata +5 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -27,6 +27,7 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
+#include <sstream>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
-        { MODEL_65B,
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,   512ull * MB },
         { MODEL_13B,  512ull * MB },
         { MODEL_30B,  512ull * MB },
-        { MODEL_65B,
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
-}
+}
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_65B, 5120ull * MB },
     };
     return _MEM_REQ_KV_SELF;
-}
+}
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_65B, 1536ull * MB },
     };
     return _MEM_REQ_EVAL;
-}
+}
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
@@ -483,6 +484,9 @@ struct llama_file_loader {
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -557,6 +561,9 @@ struct llama_file_saver {
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q4_2:
            case GGML_TYPE_Q4_3:
+           case GGML_TYPE_Q5_0:
+           case GGML_TYPE_Q5_1:
+           case GGML_TYPE_Q8_0:
                break;
            default: LLAMA_ASSERT(false);
        }
@@ -847,6 +854,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
                                      return "mostly Q4_1, some F16";
        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+       case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+       case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+       case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
        default: return "unknown, may not work";
    }
}
@@ -1075,7 +1085,7 @@ static bool llama_eval_internal(
    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
-   gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !
+   gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1249,9 +1259,11 @@ static bool llama_eval_internal(
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute (ctx0, &gf);
 
+#ifdef GGML_PERF
    // print timing information per ggml operation (for debugging purposes)
    // requires GGML_PERF to be defined
-
+   ggml_graph_print(&gf);
+#endif
 
    // plot the computation graph in dot format (for debugging purposes)
    //if (n_past%100 == 0) {
@@ -1582,6 +1594,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+       case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+       case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+       case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
        default: throw format("invalid output file type %d\n", ftype);
    };
 
@@ -1618,8 +1633,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // quantize only 2D tensors
        quantize &= (tensor.ne.size() == 2);
 
-       //
-       //if (tensor.name
+       // uncomment this to keep the output layer in FP16
+       //if (tensor.name == "output.weight") {
        //    quantize = false;
        //}
 
@@ -1787,7 +1802,7 @@ struct llama_context * llama_init_from_file(
        if (params.logits_all) {
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
-           ctx->logits.reserve(hparams.
+           ctx->logits.reserve(hparams.n_vocab);
        }
 
        if (params.embedding){
@@ -2069,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
    }
}
 
-
-
-const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.addr;
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
}
 
-
-
-
+#define LLAMA_MAX_RNG_STATE 64*1024
+
+void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+    if (seed <= 0) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
}
 
-
-
+// Returns the size of the state
+size_t llama_get_state_size(struct llama_context * ctx) {
+    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+    // for reference, std::mt19937(1337) serializes to 6701 bytes.
+    const size_t s_rng_size        = sizeof(size_t);
+    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
+    const size_t s_logits_capacity = sizeof(size_t);
+    const size_t s_logits_size     = sizeof(size_t);
+    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
+    const size_t s_embedding_size  = sizeof(size_t);
+    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
+    const size_t s_kv_size         = sizeof(size_t);
+    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_kv              = ctx->model.kv_self.buf.size;
+
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_logits_capacity
+        + s_logits_size
+        + s_logits
+        + s_embedding_size
+        + s_embedding
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+
+    return s_total;
}
 
-//
-
-
-
-
-
-
-
-
+// Copies the state to the specified destination address
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+    uint8_t * out = dest;
+
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << ctx->rng;
+
+        const size_t rng_size = rng_ss.str().size();
+        char rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+        memcpy(out, &rng_size,   sizeof(rng_size));    out += sizeof(rng_size);
+        memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+    }
+
+    // copy logits
+    {
+        const size_t logits_cap  = ctx->logits.capacity();
+        const size_t logits_size = ctx->logits.size();
+
+        memcpy(out, &logits_cap,  sizeof(logits_cap));  out += sizeof(logits_cap);
+        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+        if (logits_size) {
+            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+        }
+
+        out += logits_cap * sizeof(float);
+    }
+
+    // copy embeddings
+    {
+        const size_t embedding_size = ctx->embedding.size();
+
+        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+        if (embedding_size) {
+            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+            out += embedding_size * sizeof(float);
+        }
+    }
+
+    // copy kv cache
+    {
+        const size_t kv_size = ctx->model.kv_self.buf.size;
+        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+        }
+    }
+
+    const size_t written  = out - dest;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(written == expected);
+
+    return written;
+}
+
+// Sets the state reading from the specified source address
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * in = src;
+
+    // set rng
+    {
+        size_t rng_size;
+        char   rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
+        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+        std::stringstream rng_ss;
+        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        rng_ss >> ctx->rng;
+
+        LLAMA_ASSERT(rng_ss.fail() == false);
+    }
+
+    // set logits
+    {
+        size_t logits_cap;
+        size_t logits_size;
+
+        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
+        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+        if (logits_size) {
+            ctx->logits.resize(logits_size);
+            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+        }
+
+        in += logits_cap * sizeof(float);
+    }
+
+    // set embeddings
+    {
+        size_t embedding_size;
+
+        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+        if (embedding_size) {
+            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+            in += embedding_size * sizeof(float);
+        }
+    }
+
+    // set kv cache
+    {
+        size_t kv_size;
+        int    kv_ntok;
+
+        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+            ctx->model.kv_self.v->data = v_data;
+
+        }
+
+        ctx->model.kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(nread == expected);
+
+    return nread;
}
 
int llama_eval(
@@ -2248,3 +2430,4 @@ const char * llama_print_system_info(void) {
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
    return ctx->model.tensors_by_name;
}
+
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -74,6 +74,9 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
    };
 
    LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +115,23 @@ extern "C" {
                             const char * path_base_model,
                                      int n_threads);
 
-   // Returns the KV cache that will contain the context for the
-   // ongoing prediction with the model.
-   LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-   // Returns the size of the KV cache
-   LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
    // Returns the number of tokens in the KV cache
    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
 
-   // Sets the
-   LLAMA_API void
-
-
-
-
+   // Sets the current rng seed.
+   LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+   // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+   LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+   // Copies the state to the specified destination address.
+   // Destination needs to have allocated enough memory.
+   // Returns the number of bytes copied
+   LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+   // Set the state reading from the specified address
+   // Returns the number of bytes read
+   LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
data/ext/llama_cpp/src/llama_util.h
CHANGED
@@ -21,6 +21,9 @@
 #if defined(_POSIX_MAPPED_FILES)
 #include <sys/mman.h>
 #endif
+#if defined(_POSIX_MEMLOCK_RANGE)
+#include <sys/resource.h>
+#endif
 #endif
 #endif
 
@@ -303,8 +306,18 @@ struct llama_mlock {
        if (!mlock(addr, size)) {
            return true;
        } else {
-
-
+           char* errmsg = std::strerror(errno);
+           bool suggest = (errno == ENOMEM);
+
+           // Check if the resource limit is fine after all
+           struct rlimit lock_limit;
+           if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+               suggest = false;
+           if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+               suggest = false;
+
+           fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                   size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
            return false;
        }
    }
data/lib/llama_cpp/client.rb
ADDED
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module LLaMACpp
+  # Client provides a high-level interface to the LLM model.
+  class Client
+    # Creates a new client.
+    #
+    # @param model_path [String] The path to the model file.
+    # @param lora_adapter_path [String] The path to the LoRA adapter file.
+    # @param lora_base_path [String] The path to the LoRA base model file.
+    # @param n_ctx [Integer] The context size.
+    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
+    # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
+    # @param use_mmap [Boolean] The flag whether to use mmap.
+    # @param use_mlock [Boolean] The flag hether to use mlock.
+    # @param embedding [Boolean] The flag whether to calculate embedding.
+    # @param n_threads [Integer] The number of threads to use.
+    # @param seed [Integer] The seed for the random number generator.
+    # @return [Client]
+    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   embedding: false,
+                   n_threads: 1, seed: 0)
+      @params = {
+        model_path: model_path,
+        lora_adapter_path: lora_adapter_path,
+        lora_base_path: lora_base_path,
+        n_ctx: n_ctx,
+        n_parts: n_parts,
+        memory_f16: memory_f16,
+        use_mmap: use_mmap,
+        use_mlock: use_mlock,
+        embedding: embedding,
+        n_threads: n_threads,
+        seed: seed
+      }
+      @context_params = ContextParams.new
+      @context_params.n_ctx = n_ctx
+      @context_params.n_parts = n_parts
+      @context_params.f16_kv = memory_f16
+      @context_params.use_mmap = use_mmap
+      @context_params.use_mlock = use_mlock
+      @context_params.embedding = embedding
+      @context_params.seed = seed
+      @context = Context.new(model_path: model_path, params: @context_params)
+      return unless lora_adapter_path.is_a?(String)
+
+      if lora_base_path.is_a?(String)
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+      else
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+      end
+    end
+    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+    # Generates completions for a given prompt.
+    #
+    # @param prompt [String] The prompt to generate completions for.
+    # @param max_tokens [Integer] The maximum number of tokens to generate.
+    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+    # @param n_batch [Integer] The batch size.
+    # @param top_k [Integer] The top-k value.
+    # @param top_p [Float] The top-p value.
+    # @param temperature [Float] The temperature value.
+    # @param repeat_penalty [Float] The repeat penalty value.
+    # @return [String]
+    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                    top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+      embd_input = tokenize_prompt(prompt)
+
+      n_ctx = @context.n_ctx
+      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+      last_n_tokens = [0] * n_ctx
+
+      embd = []
+      n_consumed = 0
+      n_past = 0
+      n_remain = max_tokens
+      output = []
+
+      while n_remain != 0
+        unless embd.empty?
+          if n_past + embd.size > n_ctx
+            n_left = n_past - n_keep
+            n_past = n_keep
+            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+          end
+
+          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+        end
+
+        n_past += embd.size
+        embd.clear
+
+        if embd_input.size <= n_consumed
+          start = n_ctx - repeat_last_n
+          id = @context.sample_top_p_top_k(
+            last_n_tokens[start...(start + repeat_last_n)],
+            top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+          )
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          embd.push(id)
+          n_remain -= 1
+        else
+          while embd_input.size > n_consumed
+            embd.push(embd_input[n_consumed])
+            last_n_tokens.shift
+            last_n_tokens.push(embd_input[n_consumed])
+            n_consumed += 1
+            break if embd.size >= n_batch
+          end
+        end
+
+        embd.each { |token| output << @context.token_to_str(token) }
+
+        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      end
+
+      output.join.delete_prefix(" #{prompt}").strip
+    end
+    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+    # def chat(prompt); end
+
+    # Obtains the embedding for a given text.
+    #
+    # @param text [String] The text to obtain the embedding for.
+    # @return [Array<Float>]
+    def embeddings(text)
+      raise 'The embedding option is set to false' unless @params[:embedding]
+
+      embd_input = tokenize_prompt(text)
+      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+      @context.embeddings
+    end
+
+    private
+
+    def tokenize_prompt(prompt)
+      @context.tokenize(text: " #{prompt}", add_bos: true)
+    end
+  end
+end
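For reference, a minimal usage sketch of the new `LLaMACpp::Client` class added above. It relies only on the constructor keywords and the `#completions`/`#embeddings` methods shown in this diff; the model path is a placeholder, and `embedding: true` is needed only for the `#embeddings` call.

```ruby
require 'llama_cpp'

# Placeholder path; point this at a GGML-format LLaMA model file.
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin',
                              embedding: true, n_threads: 4, seed: 123)

# Text completion using the sampling defaults from the diff (top_k: 40, top_p: 0.95, ...).
puts client.completions('Hello, my name is', max_tokens: 32)

# Embedding vector for a piece of text (requires embedding: true above).
puts client.embeddings('Hello, world.').take(5).inspect
```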
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.6'
+  VERSION = '0.0.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-11d9023'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -2,6 +2,7 @@
 
 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
+require_relative 'llama_cpp/client'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp
 
   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context]
-  # @param prompt [String]
-  # @param
+  # @param context [LLaMACpp::Context] The context to use.
+  # @param prompt [String] The prompt to start generation with.
+  # @param n_predict [Integer] The number of tokens to predict.
+  # @param n_threads [Integer] The number of threads.
   # @return [String]
-  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+    raise ArgumentError, 'context must have loaded the model' if context.empty?
+    raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
+    spaced_prompt = " #{prompt}"
     embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
     last_n_tokens = [0] * n_ctx
 
     embd = []
     n_consumed = 0
     n_keep = 10
     n_past = 0
-    n_remain =
+    n_remain = n_predict
     repeat_last_n = 64
+    n_batch = 512
     output = []
 
     while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
           last_n_tokens.shift
           last_n_tokens.push(embd_input[n_consumed])
           n_consumed += 1
-          break if embd.size >=
+          break if embd.size >= n_batch
        end
      end
 
      embd.each { |token| output << context.token_to_str(token) }
 
-      break if embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
    end
 
    output.join.delete_prefix(spaced_prompt).strip