llama_cpp 0.3.7 → 0.3.8

@@ -273,14 +273,16 @@ struct llama_mmap {

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
 if (prefetch) {
- // Advise the kernel to preload the mapped memory
- WIN32_MEMORY_RANGE_ENTRY range;
- range.VirtualAddress = addr;
- range.NumberOfBytes = (SIZE_T)size;
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
- llama_format_win_err(GetLastError()).c_str());
- }
+ // Advise the kernel to preload the mapped memory
+
+ WIN32_MEMORY_RANGE_ENTRY range;
+ range.VirtualAddress = addr;
+
+ range.NumberOfBytes = (SIZE_T)size;
+ if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+ fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
 }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
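A side note, not part of the diff: the branch above relies on the Windows 8+ PrefetchVirtualMemory API to page the mapped model file in ahead of first use. Below is a self-contained sketch of the same pattern, assuming a build with _WIN32_WINNT targeting Windows 8 or later and an illustrative file name; it mirrors the llama_mmap approach but is not the library code.

#include <windows.h>
#include <cstdio>

int main() {
    // map an existing file read-only (path is a placeholder)
    HANDLE file = CreateFileA("model.bin", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL);
    if (file == INVALID_HANDLE_VALUE) return 1;
    HANDLE mapping = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, 0, NULL);
    if (!mapping) return 1;
    void * addr = MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); // map the whole file
    if (!addr) return 1;

    LARGE_INTEGER size;
    GetFileSizeEx(file, &size);

    // hint the kernel to prefetch the entire mapped range (Windows 8 and later)
    WIN32_MEMORY_RANGE_ENTRY range;
    range.VirtualAddress = addr;
    range.NumberOfBytes  = (SIZE_T) size.QuadPart;
    if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
        fprintf(stderr, "warning: PrefetchVirtualMemory failed (%lu)\n", GetLastError());
    }

    UnmapViewOfFile(addr);
    CloseHandle(mapping);
    CloseHandle(file);
    return 0;
}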
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)


- #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+ #if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //

- static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
+ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
- static std::map<e_model, size_t> k_sizes = {
+ std::map<e_model, size_t> k_sizes = {
 { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
 { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
 { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
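Not part of the diff: MEM_REQ_SCRATCH0 loses both the reference return and the function-local static because its entries depend on n_ctx, and a function-local static is initialized only on the first call, so a later call with a different context size would silently reuse the first sizes. A minimal sketch of that pitfall with illustrative names (not llama.cpp code):

#include <cstdio>
#include <map>

// frozen at the first call: the argument of later calls is ignored
static std::map<int, size_t> sizes_cached(int n_ctx) {
    static std::map<int, size_t> k = { { 0, (size_t) n_ctx * 16 } };
    return k;
}

// recomputed on every call, as the patched MEM_REQ_SCRATCH0 now does
static std::map<int, size_t> sizes_fresh(int n_ctx) {
    return { { 0, (size_t) n_ctx * 16 } };
}

int main() {
    printf("cached: %zu then %zu\n", sizes_cached(512).at(0), sizes_cached(2048).at(0)); // 8192, 8192
    printf("fresh:  %zu then %zu\n", sizes_fresh(512).at(0),  sizes_fresh(2048).at(0));  // 8192, 32768
    return 0;
}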
@@ -984,7 +984,7 @@ int64_t llama_time_us() {
 // model loading
 //

- static const char *llama_file_version_name(llama_file_version version) {
+ static const char * llama_file_version_name(llama_file_version version) {
 switch (version) {
 case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
 case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
@@ -996,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
 return "unknown";
 }

- static const char *llama_ftype_name(enum llama_ftype ftype) {
+ const char * llama_ftype_name(enum llama_ftype ftype) {
 switch (ftype) {
 case LLAMA_FTYPE_ALL_F32: return "all F32";
 case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1021,7 +1021,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 }
 }

- static const char *llama_model_type_name(e_model type) {
+ static const char * llama_model_type_name(e_model type) {
 switch (type) {
 case MODEL_3B: return "3B";
 case MODEL_7B: return "7B";
@@ -1609,11 +1609,11 @@ static struct ggml_cgraph * llama_build_graph(
 ggml_set_name(Q, "Q");

 struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
- n_embd_head, n_head_kv, n_past + N),
- 0, 2, 1, 3);
+ ggml_view_3d(ctx0, kv_self.k,
+ n_embd_head, n_past + N, n_head_kv,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
 offload_func_kq(K);
 ggml_set_name(K, "K");

@@ -1642,9 +1642,9 @@ static struct ggml_cgraph * llama_build_graph(
 struct ggml_tensor * V =
 ggml_view_3d(ctx0, kv_self.v,
 n_past + N, n_embd_head, n_head_kv,
- n_ctx*ggml_element_size(kv_self.v),
- n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
- n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
+ ggml_element_size(kv_self.v)*n_ctx,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
 offload_func_v(V);
 ggml_set_name(V, "V");

@@ -1799,6 +1799,13 @@ static bool llama_eval_internal(

 LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

+ LLAMA_ASSERT(n_tokens > 0);
+ LLAMA_ASSERT(n_past >= 0);
+ LLAMA_ASSERT(n_threads > 0);
+ // TODO: keep the values of n_batch and n_ctx
+ // LLAMA_ASSERT(n_tokens <= n_batch);
+ // LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
+
 const int64_t t_start_us = ggml_time_us();

 #ifdef GGML_USE_MPI
@@ -1845,11 +1852,7 @@ static bool llama_eval_internal(
 #endif

 #ifdef GGML_USE_METAL
- if (lctx.ctx_metal && N == 1) {
- // TODO: disabled until #2413 is resolved
- //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
- // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
- //}
+ if (lctx.ctx_metal) {
 ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
 ggml_metal_graph_compute(lctx.ctx_metal, gf);
 ggml_metal_get_tensor (lctx.ctx_metal, res);
@@ -1857,22 +1860,6 @@ static bool llama_eval_internal(
 ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
 }
 } else {
- // IMPORTANT:
- // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
- // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
- // coprocessor.
- //
- // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
- // But for now, we have focused only on Matrix x Vector Metal multiplication.
- //
- // TODO: avoid these syncs via shared memory (ref #1696)
- //
- if (lctx.ctx_metal) {
- // We need to sync the GPU KV cache with the CPU KV cache
- ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
- ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
- }
-
 ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 }
 #else
@@ -2097,37 +2084,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // grammar - internal
 //

+ struct llama_partial_utf8 {
+ uint32_t value; // bit value so far (unshifted)
+ int n_remain; // num bytes remaining; -1 indicates invalid sequence
+ };
+
 struct llama_grammar {
 const std::vector<std::vector<llama_grammar_element>> rules;
 std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+ // buffer for partially generated UTF-8 sequence from accepted tokens
+ llama_partial_utf8 partial_utf8;
 };

 struct llama_grammar_candidate {
- size_t index;
- const uint32_t * code_points;
+ size_t index;
+ const uint32_t * code_points;
+ llama_partial_utf8 partial_utf8;
 };

- // NOTE: assumes valid utf8 (but checks for overrun)
- // adds a terminating 0 for use as pointer
- std::vector<uint32_t> decode_utf8(const char * src) {
- static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+ // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+ // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+ const char * src,
+ llama_partial_utf8 partial_start) {
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
 const char * pos = src;
 std::vector<uint32_t> code_points;
+ uint32_t value = partial_start.value;
+ int n_remain = partial_start.n_remain;
+
+ // continue previous decode, if applicable
+ while (*pos != 0 && n_remain > 0) {
+ uint8_t next_byte = static_cast<uint8_t>(*pos);
+ if ((next_byte >> 6) != 2) {
+ // invalid sequence, abort
+ code_points.push_back(0);
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+ }
+ value = (value << 6) + (next_byte & 0x3F);
+ ++pos;
+ --n_remain;
+ }
+
+ if (partial_start.n_remain > 0 && n_remain == 0) {
+ code_points.push_back(value);
+ }
+
+ // decode any subsequent utf-8 sequences, which may end in an incomplete one
 while (*pos != 0) {
 uint8_t first_byte = static_cast<uint8_t>(*pos);
 uint8_t highbits = first_byte >> 4;
- int len = lookup[highbits];
- uint8_t mask = (1 << (8 - len)) - 1;
- uint32_t value = first_byte & mask;
- const char * end = pos + len; // may overrun!
+ n_remain = lookup[highbits] - 1;
+
+ if (n_remain < 0) {
+ // invalid sequence, abort
+ code_points.clear();
+ code_points.push_back(0);
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+ }
+
+ uint8_t mask = (1 << (7 - n_remain)) - 1;
+ value = first_byte & mask;
 ++pos;
- for ( ; pos < end && *pos != 0; ++pos) {
+ while (*pos != 0 && n_remain > 0) {
 value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+ ++pos;
+ --n_remain;
+ }
+ if (n_remain == 0) {
+ code_points.push_back(value);
 }
- code_points.push_back(value);
 }
 code_points.push_back(0);
- return code_points;
+
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }

 // returns true iff pos points to the end of one of the definitions of a rule
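Not part of the diff: the reason decode_utf8 now takes and returns a llama_partial_utf8 is that a multi-byte character can be split across two token strings, so the unfinished bit value and remaining byte count have to be carried from one call to the next. The sketch below is a simplified standalone reimplementation (not the library function; continuation-byte validation and the terminating 0 are omitted) that decodes the euro sign U+20AC, whose bytes 0xE2 0x82 0xAC arrive as "\xE2\x82" followed by "\xAC".

#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct partial_utf8 { uint32_t value; int n_remain; };

static std::pair<std::vector<uint32_t>, partial_utf8> decode(const std::string & src, partial_utf8 st) {
    std::vector<uint32_t> cps;
    size_t i = 0;
    // finish the sequence left over from the previous call
    while (i < src.size() && st.n_remain > 0) {
        st.value = (st.value << 6) | (static_cast<uint8_t>(src[i++]) & 0x3F);
        if (--st.n_remain == 0) cps.push_back(st.value);
    }
    // decode fresh sequences; the last one may be left incomplete
    while (i < src.size()) {
        uint8_t b = static_cast<uint8_t>(src[i++]);
        int len = (b < 0x80) ? 1 : ((b >> 5) == 0x6 ? 2 : ((b >> 4) == 0xE ? 3 : 4));
        st.value    = b & (0xFF >> (len == 1 ? 1 : len + 1)); // keep the payload bits of the lead byte
        st.n_remain = len - 1;
        while (i < src.size() && st.n_remain > 0) {
            st.value = (st.value << 6) | (static_cast<uint8_t>(src[i++]) & 0x3F);
            --st.n_remain;
        }
        if (st.n_remain == 0) cps.push_back(st.value);
    }
    return { cps, st };
}

int main() {
    partial_utf8 st = { 0, 0 };
    auto first  = decode("\xE2\x82", st);       // no complete code point yet, carries { 0x82, 1 }
    auto second = decode("\xAC", first.second); // completes U+20AC
    printf("first call: %zu code points, n_remain=%d\n", first.first.size(), first.second.n_remain);
    printf("second call decodes U+%04X\n", second.first.empty() ? 0u : (unsigned) second.first[0]);
    return 0;
}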
@@ -2164,6 +2195,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
 return std::make_pair(found == is_positive_char, pos);
 }

+ // returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+ // range at pos (regular or inverse range)
+ // asserts that pos is pointing to a char range element
+ static bool llama_grammar_match_partial_char(
+ const llama_grammar_element * pos,
+ const llama_partial_utf8 partial_utf8) {
+
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+ uint32_t partial_value = partial_utf8.value;
+ int n_remain = partial_utf8.n_remain;
+
+ // invalid sequence or 7-bit char split across 2 bytes (overlong)
+ if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+ return false;
+ }
+
+ // range of possible code points this partial UTF-8 sequence could complete to
+ uint32_t low = partial_value << (n_remain * 6);
+ uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+ if (low == 0) {
+ if (n_remain == 2) {
+ low = 1 << 11;
+ } else if (n_remain == 3) {
+ low = 1 << 16;
+ }
+ }
+
+ do {
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+ // inclusive range, e.g. [a-z]
+ if (pos->value <= high && low <= pos[1].value) {
+ return is_positive_char;
+ }
+ pos += 2;
+ } else {
+ // exact char match, e.g. [a] or "a"
+ if (low <= pos->value && pos->value <= high) {
+ return is_positive_char;
+ }
+ pos += 1;
+ }
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+ return !is_positive_char;
+ }
+
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
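Not part of the diff: a short worked example of the low/high bracket that llama_grammar_match_partial_char computes. If decode_utf8 has consumed only the lead byte 0xE2 of a three-byte sequence, the carried state is value = 0x2 and n_remain = 2, so every possible completion lies in U+2000..U+2FFF; a grammar element for "€" (U+20AC) can therefore still be satisfied, while [a-z] cannot. A standalone sketch of just that arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t value    = 0x2; // payload bits decoded from the lead byte 0xE2
    int      n_remain = 2;   // continuation bytes still missing
    uint32_t low  = value << (n_remain * 6);            // 0x2000
    uint32_t high = low | ((1u << (n_remain * 6)) - 1); // 0x2FFF
    // (the low == 0 adjustment in the function above does not apply here, since low != 0)
    printf("possible completions: U+%04X..U+%04X\n", (unsigned) low, (unsigned) high);
    return 0;
}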
@@ -2264,8 +2345,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 std::vector<llama_grammar_candidate> rejects;

 if (stack.empty()) {
- // accept nothing; EOS is handled elsewhere
- rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+ for (auto tok : candidates) {
+ if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+ rejects.push_back(tok);
+ }
+ }
 return rejects;
 }

@@ -2273,10 +2357,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_

 std::vector<llama_grammar_candidate> next_candidates;
 for (auto tok : candidates) {
- if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
- if (tok.code_points[1] != 0) {
- next_candidates.push_back({ tok.index, tok.code_points + 1 });
+ if (*tok.code_points == 0) {
+ // reached end of full codepoints in token, reject iff it ended in a partial sequence
+ // that cannot satisfy this position in grammar
+ if (tok.partial_utf8.n_remain != 0 &&
+ !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+ rejects.push_back(tok);
 }
+ } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+ next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
 } else {
 rejects.push_back(tok);
 }
@@ -2294,7 +2383,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_

 auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
 for (auto tok : next_rejects) {
- rejects.push_back({ tok.index, tok.code_points - 1 });
+ rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
 }

 return rejects;
@@ -2359,7 +2448,7 @@ struct llama_grammar * llama_grammar_init(
 }
 } while (true);

- return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+ return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }

 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -2665,8 +2754,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

 const llama_token eos = llama_token_eos();

- std::vector<std::vector<uint32_t>> candidates_decoded;
- std::vector<llama_grammar_candidate> candidates_grammar;
+ std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+ std::vector<llama_grammar_candidate> candidates_grammar;

 for (size_t i = 0; i < candidates->size; ++i) {
 const llama_token id = candidates->data[i].id;
@@ -2678,8 +2767,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 } else if (*str == 0) {
 candidates->data[i].logit = -INFINITY;
 } else {
- candidates_decoded.push_back(decode_utf8(str));
- candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+ candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
+ candidates_grammar.push_back({
+ i, candidates_decoded.back().first.data(), candidates_decoded.back().second
+ });
 }
 }

@@ -2880,11 +2971,14 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 }

 const char * str = llama_token_to_str(ctx, token);
+
 // Note terminating 0 in decoded string
- auto code_points = decode_utf8(str);
+ const auto decoded = decode_utf8(str, grammar->partial_utf8);
+ const auto & code_points = decoded.first;
 for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
 grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
 }
+ grammar->partial_utf8 = decoded.second;
 LLAMA_ASSERT(!grammar->stacks.empty());

 ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -3303,7 +3397,18 @@ struct llama_context * llama_new_context_with_model(
 int n_past = hparams.n_ctx - n_tokens;
 llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+ #ifdef GGML_USE_METAL
+ if (params.n_gpu_layers > 0) {
+ ctx->ctx_metal = ggml_metal_init(1);
+ if (!ctx->ctx_metal) {
+ LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+ llama_free(ctx);
+ return NULL;
+ }
+ ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+ ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+ }
+ #endif
 // measure memory requirements for the graph
 size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

@@ -3321,6 +3426,11 @@ struct llama_context * llama_new_context_with_model(

 ctx->buf_alloc.resize(alloc_size);
 ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+ #ifdef GGML_USE_METAL
+ if (ctx->ctx_metal) {
+ ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+ }
+ #endif
 }
 #else
 ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3335,7 +3445,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
 if (params.n_gpu_layers > 0) {
 // this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init(1);

 void * data_ptr = NULL;
 size_t data_size = 0;
@@ -3364,8 +3473,7 @@ struct llama_context * llama_new_context_with_model(
 LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
 LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
 }
 #endif
@@ -4173,6 +4281,10 @@ int llama_n_embd(const struct llama_context * ctx) {
 return ctx->model.hparams.n_embd;
 }

+ int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+ return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
+ }
+
 int llama_get_vocab_from_model(
 const struct llama_model * model,
 const char * * strings,
@@ -97,7 +97,7 @@ extern "C" {
 // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
 // if it exists.
 // It might not exist for progress report where '.' is output repeatedly.
- typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
+ typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);

 struct llama_context_params {
 uint32_t seed; // RNG seed, -1 for random
@@ -351,6 +351,8 @@ extern "C" {
 LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
 LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);

+ LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
 // Get the vocabulary as output parameters.
 // Returns number of results.
 LLAMA_API int llama_get_vocab(
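Not part of the diff: a hedged usage sketch for the llama_model_type() declaration above. It assumes the llama.h API of this llama.cpp revision (llama_backend_init, llama_load_model_from_file, llama_free_model) and a placeholder model path; the description written into the buffer, e.g. "LLaMA 7B mostly Q4_0", is built from llama_model_type_name and llama_ftype_name as shown earlier in the diff.

#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(false); // no NUMA

    llama_context_params params = llama_context_default_params();
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.bin", params); // placeholder path
    if (!model) {
        return 1;
    }

    char desc[128];
    llama_model_type(model, desc, sizeof(desc)); // e.g. "LLaMA 7B mostly Q4_0"
    printf("%s\n", desc);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}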
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
- VERSION = '0.3.7'
+ VERSION = '0.3.8'

 # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-9ca4abe'
+ LLAMA_CPP_VERSION = 'master-097e121'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -84,6 +84,7 @@ module LLaMACpp
 def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
 def token_to_str: (Integer) -> String
 def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+ def type: () -> String
 end

 class Timings
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
- version: 0.3.7
+ version: 0.3.8
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
- date: 2023-08-11 00:00:00.000000000 Z
+ date: 2023-08-19 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: