llama_cpp 0.3.7 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +36 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +680 -428
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +73 -128
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +176 -64
- data/ext/llama_cpp/src/llama.h +3 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama-util.h CHANGED
@@ -273,14 +273,16 @@ struct llama_mmap {
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
-
-
-
-
-
-
-
-
+            // Advise the kernel to preload the mapped memory
+
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
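The hunk above only touches the Windows branch of llama_mmap, where PrefetchVirtualMemory is used to warm the mapping. For comparison, the same "ask the kernel to preload a mapped file" idea on a POSIX system is usually spelled with madvise; the following is a minimal, self-contained sketch of that pattern (illustrative only, not code from llama-util.h):

    // Illustrative sketch only: map a read-only file and hint the kernel to prefetch it.
    // Assumes a POSIX system (Linux/macOS) with madvise(); error handling is minimal.
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) { fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }

        int fd = open(argv[1], O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        struct stat st;
        if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }
        const size_t size = (size_t) st.st_size;

        void * addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

        // Same intent as PrefetchVirtualMemory() in the Windows branch above:
        // ask the kernel to start reading the pages before they are first touched.
        if (madvise(addr, size, MADV_WILLNEED) != 0) {
            fprintf(stderr, "warning: madvise(MADV_WILLNEED) failed\n");
        }

        munmap(addr, size);
        close(fd);
        return 0;
    }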
data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
-    static std::map<e_model, size_t> k_sizes = {
+    std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,  ((size_t) n_ctx / 16ull +  92ull) * MB },
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
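The signature change above matters because a function-local static is initialized only once: the old version built the scratch-size table from the n_ctx passed on the first call and silently reused it for every later context. A small standalone sketch of that pitfall (hypothetical names, not the gem's code):

    // Demonstrates why returning a static local built from a parameter is a trap:
    // the static is initialized on the first call only, so later arguments are ignored.
    #include <cstdio>
    #include <map>

    static const std::map<int, size_t> & scratch_size_static(int n_ctx) {
        static std::map<int, size_t> k_sizes = { { 7, (size_t) n_ctx * 100 } }; // built once
        return k_sizes;
    }

    static std::map<int, size_t> scratch_size_by_value(int n_ctx) {
        std::map<int, size_t> k_sizes = { { 7, (size_t) n_ctx * 100 } };        // rebuilt per call
        return k_sizes;
    }

    int main() {
        printf("static:   %zu %zu\n", scratch_size_static(512).at(7), scratch_size_static(2048).at(7));
        printf("by value: %zu %zu\n", scratch_size_by_value(512).at(7), scratch_size_by_value(2048).at(7));
        // "static:" prints the same value twice; "by value:" reflects each n_ctx.
        return 0;
    }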
@@ -984,7 +984,7 @@ int64_t llama_time_us() {
 // model loading
 //
 
-static const char *llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
@@ -996,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:    return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1021,7 +1021,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_3B: return "3B";
        case MODEL_7B: return "7B";
@@ -1609,11 +1609,11 @@ static struct ggml_cgraph * llama_build_graph(
         ggml_set_name(Q, "Q");
 
         struct ggml_tensor * K =
-
-
-
-
-
+            ggml_view_3d(ctx0, kv_self.k,
+                    n_embd_head, n_past + N, n_head_kv,
+                    ggml_element_size(kv_self.k)*n_embd_gqa,
+                    ggml_element_size(kv_self.k)*n_embd_head,
+                    ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
         offload_func_kq(K);
         ggml_set_name(K, "K");
 
@@ -1642,9 +1642,9 @@ static struct ggml_cgraph * llama_build_graph(
         struct ggml_tensor * V =
             ggml_view_3d(ctx0, kv_self.v,
                     n_past + N, n_embd_head, n_head_kv,
-
-
-
+                    ggml_element_size(kv_self.v)*n_ctx,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
         offload_func_v(V);
         ggml_set_name(V, "V");
 
@@ -1799,6 +1799,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
+    LLAMA_ASSERT(n_tokens > 0);
+    LLAMA_ASSERT(n_past >= 0);
+    LLAMA_ASSERT(n_threads > 0);
+    // TODO: keep the values of n_batch and n_ctx
+    // LLAMA_ASSERT(n_tokens <= n_batch);
+    // LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
+
     const int64_t t_start_us = ggml_time_us();
 
 #ifdef GGML_USE_MPI
@@ -1845,11 +1852,7 @@ static bool llama_eval_internal(
 #endif
 
 #ifdef GGML_USE_METAL
-    if (lctx.ctx_metal && N == 1) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
+    if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, res);
@@ -1857,22 +1860,6 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
         }
     } else {
-        // IMPORTANT:
-        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
-        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
-        // coprocessor.
-        //
-        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
-        // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
-
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
@@ -2097,37 +2084,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // grammar - internal
 //
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
 struct llama_grammar {
     const std::vector<std::vector<llama_grammar_element>>   rules;
     std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8                                       partial_utf8;
 };
 
 struct llama_grammar_candidate {
-    size_t           index;
-    const uint32_t * code_points;
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
 };
 
-//
-//
-std::vector<uint32_t> decode_utf8(const char * src) {
-
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const char         * src,
+        llama_partial_utf8   partial_start) {
+    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char          * pos      = src;
     std::vector<uint32_t> code_points;
+    uint32_t              value    = partial_start.value;
+    int                   n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
     while (*pos != 0) {
         uint8_t  first_byte = static_cast<uint8_t>(*pos);
         uint8_t  highbits   = first_byte >> 4;
-
-
-
-
+        n_remain   = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t  mask       = (1 << (7 - n_remain)) - 1;
+        value      = first_byte & mask;
         ++pos;
-
+        while (*pos != 0 && n_remain > 0) {
            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
         }
-        code_points.push_back(value);
     }
     code_points.push_back(0);
-
+
+    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
 // returns true iff pos points to the end of one of the definitions of a rule
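The new return type is easiest to see with a concrete split sequence: the euro sign U+20AC is encoded as the bytes E2 82 AC, and if a token ends after E2 82 the decoder has to hand `{ value = 0x82, n_remain = 1 }` to the next call. The following standalone sketch reproduces only that byte arithmetic (the real decode_utf8 above is internal to llama.cpp):

    // Sketch of the partial-UTF-8 bookkeeping used above: the euro sign U+20AC is
    // encoded as E2 82 AC; if a token ends after E2 82, the decoder must remember
    // { value = 0x82, n_remain = 1 } and finish the code point from the next token.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // First token fragment: bytes E2 82 (incomplete).
        uint8_t first_byte = 0xE2;
        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
        int      n_remain = lookup[first_byte >> 4] - 1; // 3-byte sequence -> 2 continuation bytes
        uint8_t  mask     = (1 << (7 - n_remain)) - 1;   // 0x1F for a 3-byte lead byte
        uint32_t value    = first_byte & mask;           // 0x02

        value = (value << 6) + (0x82 & 0x3F);            // consume the 0x82 continuation byte
        --n_remain;                                      // one byte still missing
        assert(value == 0x82 && n_remain == 1);          // this pair is the llama_partial_utf8 state

        // Second token fragment starts with the missing continuation byte AC.
        value = (value << 6) + (0xAC & 0x3F);
        --n_remain;
        assert(n_remain == 0);
        printf("decoded U+%04X\n", (unsigned) value);    // prints "decoded U+20AC"
        return 0;
    }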
@@ -2164,6 +2195,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
     return std::make_pair(found == is_positive_char, pos);
 }
 
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+        const llama_grammar_element * pos,
+        const llama_partial_utf8      partial_utf8) {
+
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    uint32_t partial_value = partial_utf8.value;
+    int      n_remain      = partial_utf8.n_remain;
+
+    // invalid sequence or 7-bit char split across 2 bytes (overlong)
+    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+        return false;
+    }
+
+    // range of possible code points this partial UTF-8 sequence could complete to
+    uint32_t low  = partial_value << (n_remain * 6);
+    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+    if (low == 0) {
+        if (n_remain == 2) {
+            low = 1 << 11;
+        } else if (n_remain == 3) {
+            low = 1 << 16;
+        }
+    }
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            if (pos->value <= high && low <= pos[1].value) {
+                return is_positive_char;
+            }
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            if (low <= pos->value && pos->value <= high) {
+                return is_positive_char;
+            }
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return !is_positive_char;
+}
+
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
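The low/high computation above can be checked by hand. Continuing the split euro-sign example, a partial state of `value = 0x82` with one byte missing can only complete to a code point in a 64-wide window, so a grammar char range is only viable if it overlaps that window. A short sketch of the arithmetic (illustrative only):

    // Range of code points reachable from a partial UTF-8 state, mirroring the
    // low/high computation in llama_grammar_match_partial_char above.
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t partial_value = 0x82; // prefix E2 82 of a 3-byte sequence
        int      n_remain      = 1;    // one continuation byte still missing

        uint32_t low  = partial_value << (n_remain * 6);   // 0x2080
        uint32_t high = low | ((1 << (n_remain * 6)) - 1); // 0x20BF

        // Any grammar char range overlapping [0x2080, 0x20BF] could still be
        // satisfied once the final byte arrives (U+20AC, the euro sign, is inside).
        printf("[U+%04X, U+%04X]\n", (unsigned) low, (unsigned) high);
        return 0;
    }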
@@ -2264,8 +2345,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     std::vector<llama_grammar_candidate> rejects;
 
     if (stack.empty()) {
-
-
+        for (auto tok : candidates) {
+            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+                rejects.push_back(tok);
+            }
+        }
         return rejects;
     }
 
@@ -2273,10 +2357,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 
     std::vector<llama_grammar_candidate> next_candidates;
     for (auto tok : candidates) {
-        if (
-
-
+        if (*tok.code_points == 0) {
+            // reached end of full codepoints in token, reject iff it ended in a partial sequence
+            // that cannot satisfy this position in grammar
+            if (tok.partial_utf8.n_remain != 0 &&
+                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+                rejects.push_back(tok);
             }
+        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
         } else {
             rejects.push_back(tok);
         }
@@ -2294,7 +2383,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
     for (auto tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
     }
 
     return rejects;
@@ -2359,7 +2448,7 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -2665,8 +2754,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     const llama_token eos = llama_token_eos();
 
-    std::vector<std::vector<uint32_t>>   candidates_decoded;
-    std::vector<llama_grammar_candidate> candidates_grammar;
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    std::vector<llama_grammar_candidate>                              candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
@@ -2678,8 +2767,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (*str == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(str));
-            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+            candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
+            candidates_grammar.push_back({
+                i, candidates_decoded.back().first.data(), candidates_decoded.back().second
+            });
         }
     }
 
@@ -2880,11 +2971,14 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     }
 
     const char * str = llama_token_to_str(ctx, token);
+
     // Note terminating 0 in decoded string
-    auto code_points = decode_utf8(str);
+    const auto   decoded     = decode_utf8(str, grammar->partial_utf8);
+    const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
     }
+    grammar->partial_utf8 = decoded.second;
     LLAMA_ASSERT(!grammar->stacks.empty());
 
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -3303,7 +3397,18 @@ struct llama_context * llama_new_context_with_model(
             int n_past = hparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
@@ -3321,6 +3426,11 @@ struct llama_context * llama_new_context_with_model(
 
             ctx->buf_alloc.resize(alloc_size);
             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3335,7 +3445,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -3364,8 +3473,7 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -4173,6 +4281,10 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
+}
+
 int llama_get_vocab_from_model(
         const struct llama_model * model,
         const char * * strings,
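The newly exported llama_model_type() writes a human-readable description into a caller-supplied buffer, built from the (now non-static) type and ftype name helpers above. A hedged usage sketch against this revision of the C API; the model path is a placeholder and the surrounding init/teardown calls are assumed to match this version of llama.h:

    // Illustrative use of the newly exported llama_model_type(); the model path and
    // init/teardown calls are assumptions based on this llama.h revision.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init(false /* numa */);

        struct llama_context_params params = llama_context_default_params();
        struct llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        char desc[64];
        llama_model_type(model, desc, sizeof(desc));
        printf("%s\n", desc); // e.g. "LLaMA 7B mostly Q4_0", per the snprintf format above

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

On the Ruby side, the same information is exposed through the new type method declared in the llama_cpp.rbs hunk further down.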
data/ext/llama_cpp/src/llama.h CHANGED
@@ -97,7 +97,7 @@ extern "C" {
     // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
     // if it exists.
     // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
@@ -351,6 +351,8 @@ extern "C" {
     LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
     LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
 
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.7'
+  VERSION = '0.3.8'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-097e121'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -84,6 +84,7 @@ module LLaMACpp
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def type: () -> String
   end
 
   class Timings
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.7
+  version: 0.3.8
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-08-
+date: 2023-08-19 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: