llama_cpp 0.3.7 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -273,14 +273,16 @@ struct llama_mmap {
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
-            // Advise the kernel to preload the mapped memory
-            WIN32_MEMORY_RANGE_ENTRY range;
-            range.VirtualAddress = addr;
-            range.NumberOfBytes = (SIZE_T)size;
-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-            }
+            // Advise the kernel to preload the mapped memory
+
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
-    static std::map<e_model, size_t> k_sizes = {
+    std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,  ((size_t) n_ctx / 16ull +  92ull) * MB },
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
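
Note on the MEM_REQ_SCRATCH0 hunk above: a function-local static is initialized only on the first call, so the old version effectively froze the scratch sizes for whichever n_ctx was passed first, while the new version recomputes the map on every call. A minimal sketch of that C++ behavior, using hypothetical helper names rather than the llama.cpp functions:

    // Illustrative only: a static local computed from a runtime argument is initialized once.
    #include <cstddef>
    #include <cstdio>

    static size_t cached_size(int n_ctx) {
        static size_t size = (size_t) n_ctx * 16; // runs on the first call only
        return size;
    }

    static size_t fresh_size(int n_ctx) {
        size_t size = (size_t) n_ctx * 16;        // recomputed on every call
        return size;
    }

    int main() {
        const size_t a = cached_size(512);   // 8192
        const size_t b = cached_size(2048);  // still 8192: the static kept the first result
        const size_t c = fresh_size(2048);   // 32768
        std::printf("%zu %zu %zu\n", a, b, c);
        return 0;
    }
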
@@ -984,7 +984,7 @@ int64_t llama_time_us() {
 // model loading
 //
 
-static const char *llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
@@ -996,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:    return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1021,7 +1021,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
@@ -1609,11 +1609,11 @@ static struct ggml_cgraph * llama_build_graph(
         ggml_set_name(Q, "Q");
 
         struct ggml_tensor * K =
-            ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
-                        n_embd_head, n_head_kv, n_past + N),
-                    0, 2, 1, 3);
+            ggml_view_3d(ctx0, kv_self.k,
+                    n_embd_head, n_past + N, n_head_kv,
+                    ggml_element_size(kv_self.k)*n_embd_gqa,
+                    ggml_element_size(kv_self.k)*n_embd_head,
+                    ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
         offload_func_kq(K);
         ggml_set_name(K, "K");
 
@@ -1642,9 +1642,9 @@ static struct ggml_cgraph * llama_build_graph(
         struct ggml_tensor * V =
             ggml_view_3d(ctx0, kv_self.v,
                     n_past + N, n_embd_head, n_head_kv,
-                    n_ctx*ggml_element_size(kv_self.v),
-                    n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
-                    n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
+                    ggml_element_size(kv_self.v)*n_ctx,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                    ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
         offload_func_v(V);
         ggml_set_name(V, "V");
 
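
The K and V hunks above replace a view_1d/reshape/permute chain with strided 3-D views taken directly from the KV cache; only the byte-stride expressions differ between the two tensors. A rough worked example of those strides, under assumed sizes that are illustrative and not taken from the diff (2-byte f16 elements, n_embd_head = 128, n_head_kv = 32, n_ctx = 512):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t el          = 2;    // assumed f16 element size
        const size_t n_embd_head = 128;  // assumed head dimension
        const size_t n_head_kv   = 32;   // assumed number of KV heads
        const size_t n_ctx       = 512;  // assumed context length
        const size_t n_embd_gqa  = n_embd_head * n_head_kv;
        const size_t il          = 1;    // second layer, for illustration

        // strides used for the K view: next position, next head, start of layer il
        std::printf("K: nb1 = %zu, nb2 = %zu, offset = %zu\n",
                    el*n_embd_gqa, el*n_embd_head, el*n_embd_gqa*n_ctx*il);
        // strides used for the V view (note they are built from n_ctx rather than n_embd_gqa)
        std::printf("V: nb1 = %zu, nb2 = %zu, offset = %zu\n",
                    el*n_ctx, el*n_ctx*n_embd_head, el*n_ctx*n_embd_gqa*il);
        return 0;
    }
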
@@ -1799,6 +1799,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
+    LLAMA_ASSERT(n_tokens > 0);
+    LLAMA_ASSERT(n_past >= 0);
+    LLAMA_ASSERT(n_threads > 0);
+    // TODO: keep the values of n_batch and n_ctx
+    // LLAMA_ASSERT(n_tokens <= n_batch);
+    // LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
+
     const int64_t t_start_us = ggml_time_us();
 
 #ifdef GGML_USE_MPI
@@ -1845,11 +1852,7 @@ static bool llama_eval_internal(
 #endif
 
 #ifdef GGML_USE_METAL
-    if (lctx.ctx_metal && N == 1) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
+    if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, res);
@@ -1857,22 +1860,6 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
         }
     } else {
-        // IMPORTANT:
-        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
-        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
-        // coprocessor.
-        //
-        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
-        // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
-
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
@@ -2097,37 +2084,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // grammar - internal
 //
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
 struct llama_grammar {
     const std::vector<std::vector<llama_grammar_element>>   rules;
     std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8                                      partial_utf8;
 };
 
 struct llama_grammar_candidate {
-    size_t           index;
-    const uint32_t * code_points;
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
 };
 
-// NOTE: assumes valid utf8 (but checks for overrun)
-// adds a terminating 0 for use as pointer
-std::vector<uint32_t> decode_utf8(const char * src) {
-    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const char * src,
+        llama_partial_utf8 partial_start) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    uint32_t value    = partial_start.value;
+    int      n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
     while (*pos != 0) {
         uint8_t first_byte = static_cast<uint8_t>(*pos);
         uint8_t highbits = first_byte >> 4;
-        int len = lookup[highbits];
-        uint8_t mask = (1 << (8 - len)) - 1;
-        uint32_t value = first_byte & mask;
-        const char * end = pos + len; // may overrun!
+        n_remain = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t mask = (1 << (7 - n_remain)) - 1;
+        value = first_byte & mask;
         ++pos;
-        for ( ; pos < end && *pos != 0; ++pos) {
+        while (*pos != 0 && n_remain > 0) {
             value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
         }
-        code_points.push_back(value);
     }
     code_points.push_back(0);
-    return code_points;
+
+    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
 // returns true iff pos points to the end of one of the definitions of a rule
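
The rewritten decode_utf8 above threads a llama_partial_utf8 value through successive calls, so a multi-byte character split across two token strings is still decoded once the second token arrives. The following standalone sketch re-implements just that continuation idea in simplified form (it drops the terminating 0 and the invalid-sequence handling shown in the diff); it is illustrative, not the llama.cpp code:

    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    struct partial_utf8 { uint32_t value; int n_remain; };

    static std::pair<std::vector<uint32_t>, partial_utf8> decode(const char * src, partial_utf8 start) {
        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
        std::vector<uint32_t> out;
        uint32_t value    = start.value;
        int      n_remain = start.n_remain;
        const char * pos  = src;
        // finish a sequence left over from the previous call
        while (*pos != 0 && n_remain > 0) {
            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
            ++pos; --n_remain;
        }
        if (start.n_remain > 0 && n_remain == 0) out.push_back(value);
        // decode complete sequences; the last one may be left unfinished
        while (*pos != 0) {
            uint8_t first = static_cast<uint8_t>(*pos);
            n_remain = lookup[first >> 4] - 1;
            value    = first & ((1 << (7 - n_remain)) - 1);
            ++pos;
            while (*pos != 0 && n_remain > 0) {
                value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
                ++pos; --n_remain;
            }
            if (n_remain == 0) out.push_back(value);
        }
        return { out, { value, n_remain } };
    }

    int main() {
        // U+00E9 ("é") is the byte pair 0xC3 0xA9; split it across two "tokens"
        auto first  = decode("\xC3", { 0, 0 });
        std::printf("after 1st call: %zu code points, n_remain = %d\n",
                    first.first.size(), first.second.n_remain);              // 0 code points, 1 remaining
        auto second = decode("\xA9", first.second);
        std::printf("after 2nd call: U+%04X\n", (unsigned) second.first[0]); // U+00E9
        return 0;
    }
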
@@ -2164,6 +2195,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
     return std::make_pair(found == is_positive_char, pos);
 }
 
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+        const llama_grammar_element * pos,
+        const llama_partial_utf8      partial_utf8) {
+
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    uint32_t partial_value = partial_utf8.value;
+    int      n_remain      = partial_utf8.n_remain;
+
+    // invalid sequence or 7-bit char split across 2 bytes (overlong)
+    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+        return false;
+    }
+
+    // range of possible code points this partial UTF-8 sequence could complete to
+    uint32_t low  = partial_value << (n_remain * 6);
+    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+    if (low == 0) {
+        if (n_remain == 2) {
+            low = 1 << 11;
+        } else if (n_remain == 3) {
+            low = 1 << 16;
+        }
+    }
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            if (pos->value <= high && low <= pos[1].value) {
+                return is_positive_char;
+            }
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            if (low <= pos->value && pos->value <= high) {
+                return is_positive_char;
+            }
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return !is_positive_char;
+}
+
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
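
For intuition on the low/high computation in llama_grammar_match_partial_char: after accepting the lead byte 0xC3, the partial value is 0x03 with one byte remaining, so any valid completion lies in [0x03 << 6, (0x03 << 6) | 0x3F] = [0xC0, 0xFF]. A grammar range such as [a-z] (0x61-0x7A) cannot intersect that interval, so the candidate token can be rejected before its continuation byte is ever seen, while a range like [À-ÿ] (0xC0-0xFF) keeps it alive; the byte values here are illustrative.
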
@@ -2264,8 +2345,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     std::vector<llama_grammar_candidate> rejects;
 
     if (stack.empty()) {
-        // accept nothing; EOS is handled elsewhere
-        rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+        for (auto tok : candidates) {
+            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+                rejects.push_back(tok);
+            }
+        }
         return rejects;
     }
 
@@ -2273,10 +2357,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 
     std::vector<llama_grammar_candidate> next_candidates;
     for (auto tok : candidates) {
-        if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
-            if (tok.code_points[1] != 0) {
-                next_candidates.push_back({ tok.index, tok.code_points + 1 });
+        if (*tok.code_points == 0) {
+            // reached end of full codepoints in token, reject iff it ended in a partial sequence
+            // that cannot satisfy this position in grammar
+            if (tok.partial_utf8.n_remain != 0 &&
+                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+                rejects.push_back(tok);
             }
+        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
         } else {
             rejects.push_back(tok);
         }
@@ -2294,7 +2383,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
     for (auto tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
     }
 
     return rejects;
@@ -2359,7 +2448,7 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -2665,8 +2754,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     const llama_token eos = llama_token_eos();
 
-    std::vector<std::vector<uint32_t>>   candidates_decoded;
-    std::vector<llama_grammar_candidate> candidates_grammar;
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    std::vector<llama_grammar_candidate>                              candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
@@ -2678,8 +2767,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (*str == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(str));
-            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+            candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
+            candidates_grammar.push_back({
+                i, candidates_decoded.back().first.data(), candidates_decoded.back().second
+            });
         }
     }
 
@@ -2880,11 +2971,14 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     }
 
     const char * str = llama_token_to_str(ctx, token);
+
     // Note terminating 0 in decoded string
-    auto code_points = decode_utf8(str);
+    const auto   decoded     = decode_utf8(str, grammar->partial_utf8);
+    const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
     }
+    grammar->partial_utf8 = decoded.second;
     LLAMA_ASSERT(!grammar->stacks.empty());
 
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
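
For context, llama_sample_grammar and llama_grammar_accept_token are the two points where the partial-UTF-8 state enters and leaves the grammar during sampling. A hedged sketch of how they are typically combined, assuming the usual llama.h API (the helper itself is hypothetical):

    #include "llama.h"

    // Illustrative helper, not part of llama.cpp: constrain sampling with a grammar
    // and then advance the grammar with the chosen token.
    static llama_token sample_with_grammar(
            struct llama_context   * ctx,
            llama_token_data_array * candidates,
            struct llama_grammar   * grammar) {
        // mask out candidates the grammar cannot accept (decode_utf8 + the reject logic above)
        llama_sample_grammar(ctx, candidates, grammar);
        // pick a token from the remaining candidates
        const llama_token tok = llama_sample_token(ctx, candidates);
        // advance the grammar; with this release an incomplete UTF-8 suffix of the token
        // is remembered in the grammar's partial_utf8 field and completed by the next token
        llama_grammar_accept_token(ctx, grammar, tok);
        return tok;
    }
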
@@ -3303,7 +3397,18 @@ struct llama_context * llama_new_context_with_model(
             int n_past = hparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
@@ -3321,6 +3426,11 @@ struct llama_context * llama_new_context_with_model(
 
             ctx->buf_alloc.resize(alloc_size);
             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3335,7 +3445,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
         if (params.n_gpu_layers > 0) {
             // this allocates all Metal resources and memory buffers
-            ctx->ctx_metal = ggml_metal_init(1);
 
             void * data_ptr  = NULL;
             size_t data_size = 0;
@@ -3364,8 +3473,7 @@ struct llama_context * llama_new_context_with_model(
             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
         }
 #endif
@@ -4173,6 +4281,10 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
+}
+
 int llama_get_vocab_from_model(
         const struct llama_model * model,
         const char * * strings,
@@ -97,7 +97,7 @@ extern "C" {
     // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
     // if it exists.
     // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
@@ -351,6 +351,8 @@ extern "C" {
     LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
     LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
 
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
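
A brief usage sketch for the new llama_model_type export declared above; the snprintf-style return value follows the definition added in llama.cpp, while the helper and the sample output are illustrative:

    #include <cstdio>
    #include "llama.h"

    // Hypothetical helper: print the model description, e.g. "LLaMA 7B mostly Q4_0".
    static void print_model_type(const struct llama_model * model) {
        char buf[128];
        if (llama_model_type(model, buf, sizeof(buf)) >= 0) {
            std::printf("model: %s\n", buf);
        }
    }
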
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.7'
+  VERSION = '0.3.8'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-9ca4abe'
+  LLAMA_CPP_VERSION = 'master-097e121'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -84,6 +84,7 @@ module LLaMACpp
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def type: () -> String
   end
 
   class Timings
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.7
+  version: 0.3.8
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-08-11 00:00:00.000000000 Z
+date: 2023-08-19 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: