llama_cpp 0.0.6 → 0.0.7
This diff shows the changes between publicly released versions of the package as they appear in its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +26 -0
- data/ext/llama_cpp/src/ggml-cuda.h +32 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +1436 -624
- data/ext/llama_cpp/src/ggml.h +654 -627
- data/ext/llama_cpp/src/llama.cpp +212 -29
- data/ext/llama_cpp/src/llama.h +17 -13
- data/ext/llama_cpp/src/llama_util.h +15 -2
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +16 -1
- metadata +5 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -27,6 +27,7 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
+#include <sstream>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B,
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B,
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
-}
+}
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_65B, 5120ull * MB },
     };
     return _MEM_REQ_KV_SELF;
-}
+}
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_65B, 1536ull * MB },
     };
     return _MEM_REQ_EVAL;
-}
+}
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
@@ -483,6 +484,9 @@ struct llama_file_loader {
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                 break;
             default: {
                 throw format("unrecognized tensor type %u\n", shard.type);
@@ -557,6 +561,9 @@ struct llama_file_saver {
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -847,6 +854,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
                 return "mostly Q4_1, some F16";
         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
         default: return "unknown, may not work";
     }
 }
@@ -1075,7 +1085,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1249,9 +1259,11 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+#ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-
+    ggml_graph_print(&gf);
+#endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
@@ -1582,6 +1594,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
@@ -1618,8 +1633,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
 
-        //
-        //if (tensor.name
+        // uncomment this to keep the output layer in FP16
+        //if (tensor.name == "output.weight") {
         //    quantize = false;
         //}
 
@@ -1787,7 +1802,7 @@ struct llama_context * llama_init_from_file(
         if (params.logits_all) {
             ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
         } else {
-            ctx->logits.reserve(hparams.
+            ctx->logits.reserve(hparams.n_vocab);
         }
 
         if (params.embedding){
@@ -2069,31 +2084,198 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-
-
-const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.addr;
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
 }
 
-
-
-
+#define LLAMA_MAX_RNG_STATE 64*1024
+
+void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+    if (seed <= 0) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
 }
 
-
-
+// Returns the size of the state
+size_t llama_get_state_size(struct llama_context * ctx) {
+    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+    // for reference, std::mt19937(1337) serializes to 6701 bytes.
+    const size_t s_rng_size        = sizeof(size_t);
+    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
+    const size_t s_logits_capacity = sizeof(size_t);
+    const size_t s_logits_size     = sizeof(size_t);
+    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
+    const size_t s_embedding_size  = sizeof(size_t);
+    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
+    const size_t s_kv_size         = sizeof(size_t);
+    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_kv              = ctx->model.kv_self.buf.size;
+
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_logits_capacity
+        + s_logits_size
+        + s_logits
+        + s_embedding_size
+        + s_embedding
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+
+    return s_total;
 }
 
-//
-
-
-
-
-
-
-
-
+// Copies the state to the specified destination address
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+    uint8_t * out = dest;
+
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << ctx->rng;
+
+        const size_t rng_size = rng_ss.str().size();
+        char rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+        memcpy(out, &rng_size,   sizeof(rng_size));    out += sizeof(rng_size);
+        memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+    }
+
+    // copy logits
+    {
+        const size_t logits_cap  = ctx->logits.capacity();
+        const size_t logits_size = ctx->logits.size();
+
+        memcpy(out, &logits_cap,  sizeof(logits_cap));  out += sizeof(logits_cap);
+        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+        if (logits_size) {
+            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+        }
+
+        out += logits_cap * sizeof(float);
+    }
+
+    // copy embeddings
+    {
+        const size_t embedding_size = ctx->embedding.size();
+
+        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+        if (embedding_size) {
+            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+            out += embedding_size * sizeof(float);
+        }
+    }
+
+    // copy kv cache
+    {
+        const size_t kv_size = ctx->model.kv_self.buf.size;
+        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+        }
+    }
+
+    const size_t written  = out - dest;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(written == expected);
+
+    return written;
+}
+
+// Sets the state reading from the specified source address
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * in = src;
+
+    // set rng
+    {
+        size_t rng_size;
+        char   rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
+        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+        std::stringstream rng_ss;
+        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        rng_ss >> ctx->rng;
+
+        LLAMA_ASSERT(rng_ss.fail() == false);
+    }
+
+    // set logits
+    {
+        size_t logits_cap;
+        size_t logits_size;
+
+        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
+        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+        if (logits_size) {
+            ctx->logits.resize(logits_size);
+            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+        }
+
+        in += logits_cap * sizeof(float);
+    }
+
+    // set embeddings
+    {
+        size_t embedding_size;
+
+        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+        LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+        if (embedding_size) {
+            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+            in += embedding_size * sizeof(float);
+        }
+    }
+
+    // set kv cache
+    {
+        size_t kv_size;
+        int kv_ntok;
+
+        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+            ctx->model.kv_self.v->data = v_data;
+
+        }
+
+        ctx->model.kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(nread == expected);
+
+    return nread;
 }
 
 int llama_eval(
@@ -2248,3 +2430,4 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
+
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -74,6 +74,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +115,23 @@ extern "C" {
             const char * path_base_model,
             int n_threads);
 
-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
 
-    // Sets the
-    LLAMA_API void
-
-
-
-
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
data/ext/llama_cpp/src/llama_util.h
CHANGED
@@ -21,6 +21,9 @@
 #if defined(_POSIX_MAPPED_FILES)
 #include <sys/mman.h>
 #endif
+#if defined(_POSIX_MEMLOCK_RANGE)
+#include <sys/resource.h>
+#endif
 #endif
 #endif
 
@@ -303,8 +306,18 @@ struct llama_mlock {
         if (!mlock(addr, size)) {
             return true;
         } else {
-
-
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
             return false;
         }
     }
data/lib/llama_cpp/client.rb
ADDED
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module LLaMACpp
+  # Client provides a high-level interface to the LLM model.
+  class Client
+    # Creates a new client.
+    #
+    # @param model_path [String] The path to the model file.
+    # @param lora_adapter_path [String] The path to the LoRA adapter file.
+    # @param lora_base_path [String] The path to the LoRA base model file.
+    # @param n_ctx [Integer] The context size.
+    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
+    # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
+    # @param use_mmap [Boolean] The flag whether to use mmap.
+    # @param use_mlock [Boolean] The flag hether to use mlock.
+    # @param embedding [Boolean] The flag whether to calculate embedding.
+    # @param n_threads [Integer] The number of threads to use.
+    # @param seed [Integer] The seed for the random number generator.
+    # @return [Client]
+    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   embedding: false,
+                   n_threads: 1, seed: 0)
+      @params = {
+        model_path: model_path,
+        lora_adapter_path: lora_adapter_path,
+        lora_base_path: lora_base_path,
+        n_ctx: n_ctx,
+        n_parts: n_parts,
+        memory_f16: memory_f16,
+        use_mmap: use_mmap,
+        use_mlock: use_mlock,
+        embedding: embedding,
+        n_threads: n_threads,
+        seed: seed
+      }
+      @context_params = ContextParams.new
+      @context_params.n_ctx = n_ctx
+      @context_params.n_parts = n_parts
+      @context_params.f16_kv = memory_f16
+      @context_params.use_mmap = use_mmap
+      @context_params.use_mlock = use_mlock
+      @context_params.embedding = embedding
+      @context_params.seed = seed
+      @context = Context.new(model_path: model_path, params: @context_params)
+      return unless lora_adapter_path.is_a?(String)
+
+      if lora_base_path.is_a?(String)
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+      else
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+      end
+    end
+    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+    # Generates completions for a given prompt.
+    #
+    # @param prompt [String] The prompt to generate completions for.
+    # @param max_tokens [Integer] The maximum number of tokens to generate.
+    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+    # @param n_batch [Integer] The batch size.
+    # @param top_k [Integer] The top-k value.
+    # @param top_p [Float] The top-p value.
+    # @param temperature [Float] The temperature value.
+    # @param repeat_penalty [Float] The repeat penalty value.
+    # @return [String]
+    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                    top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+      embd_input = tokenize_prompt(prompt)
+
+      n_ctx = @context.n_ctx
+      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+      last_n_tokens = [0] * n_ctx
+
+      embd = []
+      n_consumed = 0
+      n_past = 0
+      n_remain = max_tokens
+      output = []
+
+      while n_remain != 0
+        unless embd.empty?
+          if n_past + embd.size > n_ctx
+            n_left = n_past - n_keep
+            n_past = n_keep
+            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+          end
+
+          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+        end
+
+        n_past += embd.size
+        embd.clear
+
+        if embd_input.size <= n_consumed
+          start = n_ctx - repeat_last_n
+          id = @context.sample_top_p_top_k(
+            last_n_tokens[start...(start + repeat_last_n)],
+            top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+          )
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          embd.push(id)
+          n_remain -= 1
+        else
+          while embd_input.size > n_consumed
+            embd.push(embd_input[n_consumed])
+            last_n_tokens.shift
+            last_n_tokens.push(embd_input[n_consumed])
+            n_consumed += 1
+            break if embd.size >= n_batch
+          end
+        end
+
+        embd.each { |token| output << @context.token_to_str(token) }
+
+        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      end
+
+      output.join.delete_prefix(" #{prompt}").strip
+    end
+    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+    # def chat(prompt); end
+
+    # Obtains the embedding for a given text.
+    #
+    # @param text [String] The text to obtain the embedding for.
+    # @return [Array<Float>]
+    def embeddings(text)
+      raise 'The embedding option is set to false' unless @params[:embedding]
+
+      embd_input = tokenize_prompt(text)
+      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+      @context.embeddings
+    end
+
+    private
+
+    def tokenize_prompt(prompt)
+      @context.tokenize(text: " #{prompt}", add_bos: true)
+    end
+  end
+end
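For orientation, here is a minimal usage sketch of the new LLaMACpp::Client class shown above. The model path is a placeholder and the keyword arguments simply mirror the signatures in the new file; treat this as an illustration under those assumptions rather than official documentation.

require 'llama_cpp'

# Placeholder model path; any GGML-format LLaMA model file is assumed here.
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin',
                              n_ctx: 512, n_threads: 4, seed: 42, embedding: true)

# Text completion using the sampling options exposed by Client#completions.
puts client.completions('Hello, my name is',
                        max_tokens: 32, top_k: 40, top_p: 0.95,
                        temperature: 0.8, repeat_penalty: 1.1)

# Sentence embedding; Client#embeddings raises unless embedding: true was passed at construction time.
p client.embeddings('Hello, world.').first(5)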
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.6'
+  VERSION = '0.0.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-11d9023'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -2,6 +2,7 @@
 
 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
+require_relative 'llama_cpp/client'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp
 
   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context]
-  # @param prompt [String]
-  # @param
+  # @param context [LLaMACpp::Context] The context to use.
+  # @param prompt [String] The prompt to start generation with.
+  # @param n_predict [Integer] The number of tokens to predict.
+  # @param n_threads [Integer] The number of threads.
   # @return [String]
-  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+    raise ArgumentError, 'context must have loaded the model' if context.empty?
+    raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
+    spaced_prompt = " #{prompt}"
     embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
     last_n_tokens = [0] * n_ctx
 
     embd = []
     n_consumed = 0
     n_keep = 10
     n_past = 0
-    n_remain =
+    n_remain = n_predict
     repeat_last_n = 64
+    n_batch = 512
     output = []
 
     while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
           last_n_tokens.shift
           last_n_tokens.push(embd_input[n_consumed])
           n_consumed += 1
-          break if embd.size >=
+          break if embd.size >= n_batch
         end
       end
 
       embd.each { |token| output << context.token_to_str(token) }
 
-      break if embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
     end
 
     output.join.delete_prefix(spaced_prompt).strip
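And a corresponding minimal sketch of the module-level LLaMACpp.generate helper with the new n_predict keyword. Again the model path is a placeholder; ContextParams, Context, and generate are used only as their signatures appear in the diffs above.

require 'llama_cpp'

# Placeholder model path; ContextParams and Context are used as in client.rb above.
params = LLaMACpp::ContextParams.new
params.seed = 123

context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# n_predict caps the number of generated tokens (new keyword in 0.0.7); n_threads controls CPU threads.
puts LLaMACpp.generate(context, 'What is the capital of France?', n_predict: 64, n_threads: 4)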