llama_cpp 0.3.7 → 0.3.8
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +36 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +680 -428
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +73 -128
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +176 -64
- data/ext/llama_cpp/src/llama.h +3 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
@@ -273,14 +273,16 @@ struct llama_mmap {
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
-            … (8 lines not shown)
+            // Advise the kernel to preload the mapped memory
+
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
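For context only (not part of the gem's diff): the added block asks Windows to start paging the memory-mapped model file in before first access. A minimal sketch of the same Win32 calls, with a hypothetical file name and helper and with error handling omitted:

    #include <windows.h>

    // Sketch only: map a file read-only and hint the kernel to page it in up front.
    // The prefetch hint is available on Windows 8 and newer.
    static void * map_with_prefetch(const char * path) {
        HANDLE file    = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL,
                                     OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
        LARGE_INTEGER size;
        GetFileSizeEx(file, &size);
        HANDLE mapping = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, 0, NULL);
        void * addr    = MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
    #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
        WIN32_MEMORY_RANGE_ENTRY range;
        range.VirtualAddress = addr;
        range.NumberOfBytes  = (SIZE_T) size.QuadPart;
        PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0);  // best-effort hint, may fail
    #endif
        return addr;
    }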
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
-    static std::map<e_model, size_t> k_sizes = {
+    std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,  ((size_t) n_ctx / 16ull +  92ull) * MB },
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
@@ -984,7 +984,7 @@ int64_t llama_time_us() {
 // model loading
 //
 
-static const char *llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
@@ -996,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:    return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1021,7 +1021,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
@@ -1609,11 +1609,11 @@ static struct ggml_cgraph * llama_build_graph(
                 ggml_set_name(Q, "Q");
 
                 struct ggml_tensor * K =
-                    … (5 lines not shown)
+                    ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_past + N, n_head_kv,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
                 offload_func_kq(K);
                 ggml_set_name(K, "K");
 
@@ -1642,9 +1642,9 @@ static struct ggml_cgraph * llama_build_graph(
                 struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
                             n_past + N, n_embd_head, n_head_kv,
-                            … (3 lines not shown)
+                            ggml_element_size(kv_self.v)*n_ctx,
+                            ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                            ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
                 offload_func_v(V);
                 ggml_set_name(V, "V");
 
@@ -1799,6 +1799,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
+    LLAMA_ASSERT(n_tokens > 0);
+    LLAMA_ASSERT(n_past >= 0);
+    LLAMA_ASSERT(n_threads > 0);
+    // TODO: keep the values of n_batch and n_ctx
+    // LLAMA_ASSERT(n_tokens <= n_batch);
+    // LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
+
     const int64_t t_start_us = ggml_time_us();
 
 #ifdef GGML_USE_MPI
@@ -1845,11 +1852,7 @@ static bool llama_eval_internal(
 #endif
 
 #ifdef GGML_USE_METAL
-    if (lctx.ctx_metal && N == 1) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
+    if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, res);
@@ -1857,22 +1860,6 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
         }
     } else {
-        // IMPORTANT:
-        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
-        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
-        // coprocessor.
-        //
-        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
-        // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
-
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
@@ -2097,37 +2084,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // grammar - internal
 //
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
 struct llama_grammar {
     const std::vector<std::vector<llama_grammar_element>>   rules;
     std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
 };
 
 struct llama_grammar_candidate {
-    size_t           index;
-    const uint32_t * code_points;
+    size_t             index;
+    const uint32_t   * code_points;
+    llama_partial_utf8 partial_utf8;
 };
 
-// …
-// …
-std::vector<uint32_t> decode_utf8(
-    …
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const char         * src,
+        llama_partial_utf8   partial_start) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    uint32_t value    = partial_start.value;
+    int      n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
     while (*pos != 0) {
         uint8_t first_byte = static_cast<uint8_t>(*pos);
         uint8_t highbits   = first_byte >> 4;
-        … (4 lines not shown)
+                n_remain   = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t mask = (1 << (7 - n_remain)) - 1;
+                value = first_byte & mask;
         ++pos;
-        …
+        while (*pos != 0 && n_remain > 0) {
             value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
         }
-        code_points.push_back(value);
     }
     code_points.push_back(0);
-    …
+
+    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
 // returns true iff pos points to the end of one of the definitions of a rule
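For illustration only (not part of the diff): the new second return value lets a decode that stops mid-character be resumed on the next token. Assuming the definitions above, a token that ends on the first byte of "é" (0xC3 0xA9) decodes like this across two calls:

    llama_partial_utf8 state = { 0, 0 };

    auto first  = decode_utf8("\xC3", state);         // token ends mid-sequence
    // first.first  == { 0 }                           only the terminating 0, no code point yet
    // first.second == { value = 0x03, n_remain = 1 }  one continuation byte still expected

    auto second = decode_utf8("\xA9", first.second);   // next token completes the character
    // second.first  == { 0xE9, 0 }                    U+00E9 followed by the terminator
    // second.second.n_remain == 0                     no partial sequence left over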
@@ -2164,6 +2195,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
     return std::make_pair(found == is_positive_char, pos);
 }
 
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+        const llama_grammar_element * pos,
+        const llama_partial_utf8      partial_utf8) {
+
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    uint32_t partial_value = partial_utf8.value;
+    int      n_remain      = partial_utf8.n_remain;
+
+    // invalid sequence or 7-bit char split across 2 bytes (overlong)
+    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+        return false;
+    }
+
+    // range of possible code points this partial UTF-8 sequence could complete to
+    uint32_t low  = partial_value << (n_remain * 6);
+    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+    if (low == 0) {
+        if (n_remain == 2) {
+            low = 1 << 11;
+        } else if (n_remain == 3) {
+            low = 1 << 16;
+        }
+    }
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            if (pos->value <= high && low <= pos[1].value) {
+                return is_positive_char;
+            }
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            if (low <= pos->value && pos->value <= high) {
+                return is_positive_char;
+            }
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return !is_positive_char;
+}
+
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
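For illustration only (not part of the diff): a worked instance of the window computed above. A token whose last byte is the lone lead byte 0xE2 leaves partial_utf8 = { value = 0x02, n_remain = 2 }, so:

    uint32_t value    = 0x02;                            // 0xE2 & 0x1F, from decode_utf8
    int      n_remain = 2;                               // two continuation bytes still missing
    uint32_t low  = value << (n_remain * 6);             // 0x2000
    uint32_t high = low | ((1 << (n_remain * 6)) - 1);   // 0x2FFF
    // Any completion decodes to a code point in U+2000..U+2FFF, so a positive range such as
    // [a-z] (U+0061..U+007A) can never be satisfied and the candidate is rejected without
    // waiting for the remaining bytes; a range overlapping U+2000..U+2FFF keeps it alive.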
@@ -2264,8 +2345,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     std::vector<llama_grammar_candidate> rejects;
 
     if (stack.empty()) {
-        … (2 lines not shown)
+        for (auto tok : candidates) {
+            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+                rejects.push_back(tok);
+            }
+        }
         return rejects;
     }
 
@@ -2273,10 +2357,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 
     std::vector<llama_grammar_candidate> next_candidates;
     for (auto tok : candidates) {
-        if (…
-        … (2 lines not shown)
+        if (*tok.code_points == 0) {
+            // reached end of full codepoints in token, reject iff it ended in a partial sequence
+            // that cannot satisfy this position in grammar
+            if (tok.partial_utf8.n_remain != 0 &&
+                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+                rejects.push_back(tok);
             }
+        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
         } else {
             rejects.push_back(tok);
         }
@@ -2294,7 +2383,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
     for (auto tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
     }
 
     return rejects;
@@ -2359,7 +2448,7 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -2665,8 +2754,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     const llama_token eos = llama_token_eos();
 
-    std::vector<std::vector<uint32_t>>   candidates_decoded;
-    std::vector<llama_grammar_candidate> candidates_grammar;
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    std::vector<llama_grammar_candidate>                              candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
@@ -2678,8 +2767,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (*str == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(str));
-            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+            candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
+            candidates_grammar.push_back({
+                i, candidates_decoded.back().first.data(), candidates_decoded.back().second
+            });
         }
     }
 
@@ -2880,11 +2971,14 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     }
 
     const char * str = llama_token_to_str(ctx, token);
+
     // Note terminating 0 in decoded string
-    auto code_points = decode_utf8(str);
+    const auto   decoded     = decode_utf8(str, grammar->partial_utf8);
+    const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
     }
+    grammar->partial_utf8 = decoded.second;
     LLAMA_ASSERT(!grammar->stacks.empty());
 
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
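For illustration only (not part of the diff): where these two functions sit in a typical sampling loop. The partial_utf8 state added in this release lives inside the grammar object, so tokens that split a multi-byte character across iterations are no longer rejected outright. A sketch, assuming candidates is a llama_token_data_array built from the current logits and grammar comes from llama_grammar_init():

    llama_sample_grammar(ctx, &candidates, grammar);       // mask tokens the grammar forbids
    const llama_token tok = llama_sample_token(ctx, &candidates);
    llama_grammar_accept_token(ctx, grammar, tok);          // advances stacks and partial_utf8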
@@ -3303,7 +3397,18 @@ struct llama_context * llama_new_context_with_model(
             int n_past = hparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-            …
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
@@ -3321,6 +3426,11 @@ struct llama_context * llama_new_context_with_model(
 
             ctx->buf_alloc.resize(alloc_size);
             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3335,7 +3445,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -3364,8 +3473,7 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -4173,6 +4281,10 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
+}
+
 int llama_get_vocab_from_model(
         const struct llama_model * model,
         const char * * strings,
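For illustration only (not part of the diff): the new llama_model_type() fills a caller-supplied buffer with a short description such as "LLaMA 7B mostly Q4_0" and returns the snprintf result. A minimal sketch, assuming model came from llama_load_model_from_file():

    char desc[64];
    if (llama_model_type(model, desc, sizeof(desc)) > 0) {
        printf("loaded: %s\n", desc);   // e.g. "loaded: LLaMA 7B mostly Q4_0"
    }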
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -97,7 +97,7 @@ extern "C" {
     // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
     // if it exists.
     // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
@@ -351,6 +351,8 @@ extern "C" {
     LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
     LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
 
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.7'
+  VERSION = '0.3.8'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-…'
+  LLAMA_CPP_VERSION = 'master-097e121'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -84,6 +84,7 @@ module LLaMACpp
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def type: () -> String
   end
 
   class Timings
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.7
+  version: 0.3.8
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-08-…
+date: 2023-08-19 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: