llama_cpp 0.3.5 → 0.3.6

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
@@ -56,8 +56,14 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

+ #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+ #include "ggml-alloc.h"
+ #define LLAMA_USE_ALLOCATOR
+ #else
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
+ #endif
+

  // available llama models
  enum e_model {
@@ -327,13 +333,22 @@ struct llama_model {

  struct llama_context {
  llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
- #ifdef GGML_USE_METAL
  ~llama_context() {
+ if (model_owner) {
+ delete &model;
+ }
+ #ifdef GGML_USE_METAL
  if (ctx_metal) {
  ggml_metal_free(ctx_metal);
  }
- }
  #endif
+ #ifdef LLAMA_USE_ALLOCATOR
+ if (alloc) {
+ ggml_allocr_free(alloc);
+ }
+ #endif
+ }
+
  std::mt19937 rng;

  bool has_evaluated_once = false;
@@ -371,7 +386,17 @@ struct llama_context {
  // memory buffers used to evaluate the model
  // TODO: move in llama_state
  llama_ctx_buffer buf_compute;
+
+ #ifdef LLAMA_USE_ALLOCATOR
+ llama_ctx_buffer buf_alloc;
+ ggml_allocr * alloc = NULL;
+ #endif
+
+ #ifdef LLAMA_USE_SCRATCH
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+ int buf_last = 0;
+ size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+ #endif

  #ifdef GGML_USE_METAL
  ggml_metal_context * ctx_metal = NULL;
@@ -381,9 +406,6 @@ struct llama_context {
  ggml_mpi_context * ctx_mpi = NULL;
  #endif

- int buf_last = 0;
- size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
  void use_buf(struct ggml_context * ctx, int i) {
  #if defined(LLAMA_USE_SCRATCH)
  size_t last_size = 0;
@@ -879,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
+ /*.mul_mat_q =*/ false,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
@@ -1006,6 +1029,7 @@ static void llama_model_load_internal(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ const bool mul_mat_q,
  float rope_freq_base,
  float rope_freq_scale,
  bool low_vram,
@@ -1134,9 +1158,11 @@ static void llama_model_load_internal(
  }

  (void) main_gpu;
+ (void) mul_mat_q;
  #if defined(GGML_USE_CUBLAS)
  fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
  ggml_cuda_set_main_device(main_gpu);
+ ggml_cuda_set_mul_mat_q(mul_mat_q);
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
  #elif defined(GGML_USE_CLBLAST)
@@ -1230,12 +1256,16 @@ static void llama_model_load_internal(
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

  // this is the total memory required to run the inference
- const size_t mem_required =
+ size_t mem_required =
  ctx_size +
- mmapped_size - vram_weights + // weights in VRAM not in memory
+ mmapped_size - vram_weights; // weights in VRAM not in memory
+
+ #ifndef LLAMA_USE_ALLOCATOR
+ mem_required +=
  MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
  MEM_REQ_SCRATCH1().at(model.type) +
  MEM_REQ_EVAL().at(model.type);
+ #endif

  // this is the memory required by one llama_state
  const size_t mem_required_state =
@@ -1341,6 +1371,7 @@ static bool llama_model_load(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ const bool mul_mat_q,
  float rope_freq_base,
  float rope_freq_scale,
  bool low_vram,
@@ -1351,7 +1382,8 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+ main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1360,32 +1392,15 @@ static bool llama_model_load(
  }
  }

- // evaluate the transformer
- //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - embd embeddings input
- // - n_tokens number of tokens
- // - n_past: the context size so far
- // - n_threads: number of threads to use
- //
- static bool llama_eval_internal(
+ static struct ggml_cgraph * llama_build_graph(
  llama_context & lctx,
  const llama_token * tokens,
  const float * embd,
  int n_tokens,
- int n_past,
- int n_threads,
- const char * cgraph_fname) {
+ int n_past) {

  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

- #ifdef GGML_USE_MPI
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
- #endif
-
- const int64_t t_start_us = ggml_time_us();
-
  const int N = n_tokens;

  const auto & model = lctx.model;
@@ -1401,10 +1416,8 @@ static bool llama_eval_internal(
  const int64_t n_head = hparams.n_head;
  const int64_t n_head_kv = hparams.n_head_kv;
  const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_vocab = hparams.n_vocab;
  const int64_t n_embd_gqa = hparams.n_embd_gqa();

-
  LLAMA_ASSERT(n_embd_head == hparams.n_rot);

  const float freq_base = hparams.rope_freq_base;
@@ -1416,26 +1429,35 @@ static bool llama_eval_internal(
  auto & mem_per_token = lctx.mem_per_token;
  auto & buf_compute = lctx.buf_compute;

+
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.addr,
  /*.no_alloc =*/ false,
  };

+ #ifdef LLAMA_USE_ALLOCATOR
+ params.no_alloc = true;
+ #endif
+
  struct ggml_context * ctx0 = ggml_init(params);

  ggml_cgraph * gf = ggml_new_graph(ctx0);

- // for big prompts, if BLAS is enabled, it is better to use only one thread
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

  if (tokens) {
  struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+ #ifdef LLAMA_USE_ALLOCATOR
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ }
+ #else
  memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ #endif
  ggml_set_name(inp_tokens, "inp_tokens");

  inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1445,7 +1467,15 @@ static bool llama_eval_internal(
  #endif

  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+ #ifdef LLAMA_USE_ALLOCATOR
+ ggml_allocr_alloc(lctx.alloc, inpL);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ }
+ #else
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ #endif
  }

  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1472,6 +1502,17 @@ static bool llama_eval_internal(
  }
  #endif // GGML_USE_CUBLAS

+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ #ifdef LLAMA_USE_ALLOCATOR
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ }
+ #else
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ #endif
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
  for (int il = 0; il < n_layer; ++il) {
  ggml_format_name(inpL, "layer_inp_%d", il);

@@ -1567,9 +1608,6 @@ static bool llama_eval_internal(
  ggml_set_name(KQ, "KQ");

  // KQ_scaled = KQ / sqrt(n_embd_head)
- struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
  // KQ_scaled shape [n_past + N, N, n_head, 1]
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
  offload_func_kq(KQ_scaled);
@@ -1685,9 +1723,6 @@ static bool llama_eval_internal(

  lctx.use_buf(ctx0, 0);

- // used at the end to optionally extract the embeddings
- struct ggml_tensor * embeddings = NULL;
-
  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1698,8 +1733,6 @@ static bool llama_eval_internal(
  cur = ggml_mul(ctx0, cur, model.norm);
  // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
  ggml_set_name(cur, "result_norm");
-
- embeddings = cur;
  }

  // lm_head
@@ -1711,12 +1744,88 @@ static bool llama_eval_internal(
  // logits -> probs
  //cur = ggml_soft_max_inplace(ctx0, cur);

- // run the computation
  ggml_build_forward_expand(gf, cur);

- // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+ if (mem_per_token == 0) {
+ mem_per_token = ggml_used_mem(ctx0)/N;
+ }
+
+ #if 0
+ printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+ ggml_used_mem(ctx0)/1024.0/1024.0,
+ lctx.get_buf_max_mem(0)/1024.0/1024.0,
+ lctx.get_buf_max_mem(1)/1024.0/1024.0,
+ lctx.work_buffer.size()/1024.0/1024.0,
+ n_past, N);
+ #endif
+
+ ggml_free(ctx0);
+
+ return gf;
+ }
+
+ // evaluate the transformer
+ //
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - embd embeddings input
+ // - n_tokens number of tokens
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
+ //
+ static bool llama_eval_internal(
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads,
+ const char * cgraph_fname) {
+
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+ const int64_t t_start_us = ggml_time_us();
+
+ #ifdef GGML_USE_MPI
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif
+
+ const int N = n_tokens;
+
+ const auto & model = lctx.model;
+ const auto & hparams = model.hparams;
+
+ const auto & kv_self = lctx.kv_self;
+
+ LLAMA_ASSERT(!!kv_self.ctx);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_vocab = hparams.n_vocab;
+
+ #ifdef LLAMA_USE_ALLOCATOR
+ ggml_allocr_reset(lctx.alloc);
+ #endif
+
+ ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+ #ifdef LLAMA_USE_ALLOCATOR
+ ggml_allocr_alloc_graph(lctx.alloc, gf);
+ #endif
+
+ // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
+ // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+ LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+ LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);

  #if GGML_USE_MPI
+ const int64_t n_layer = hparams.n_layer;
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
  #endif

@@ -1728,7 +1837,10 @@ static bool llama_eval_internal(
  //}
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
- ggml_metal_get_tensor (lctx.ctx_metal, cur);
+ ggml_metal_get_tensor (lctx.ctx_metal, res);
+ if (!lctx.embedding.empty()) {
+ ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+ }
  } else {
  // IMPORTANT:
  // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1759,8 +1871,6 @@ static bool llama_eval_internal(
  // update kv token count
  lctx.kv_self.n = n_past + N;

- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-
  if (cgraph_fname) {
  ggml_graph_export(gf, cgraph_fname);
  }
@@ -1798,21 +1908,6 @@ static bool llama_eval_internal(
  memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
  }

- if (mem_per_token == 0) {
- mem_per_token = ggml_used_mem(ctx0)/N;
- }
-
- #if 0
- printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
- ggml_used_mem(ctx0)/1024.0/1024.0,
- lctx.get_buf_max_mem(0)/1024.0/1024.0,
- lctx.get_buf_max_mem(1)/1024.0/1024.0,
- lctx.work_buffer.size()/1024.0/1024.0,
- n_past, N);
- #endif
-
- ggml_free(ctx0);
-
  // measure the performance only for the single-token evals
  if (N == 1) {
  lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1924,7 +2019,9 @@ struct llama_tokenizer {
  if (token == vocab_.token_to_id.end()) {
  // output any symbols that did not form tokens as bytes.
  for (int j = 0; j < (int) symbol.n; ++j) {
- llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+ // NOTE: old version, before #2420 - not sure what are the implications of this
+ //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+ llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
  output.push_back(token_id);
  }
  } else {
@@ -3101,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

  if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+ params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
  params.progress_callback_user_data)) {
  delete model;
@@ -3178,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
  ctx->embedding.resize(hparams.n_embd);
  }

+ #ifdef LLAMA_USE_ALLOCATOR
+ {
+ static const size_t tensor_alignment = 32;
+ // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+ ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+ // create measure allocator
+ ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+ // build worst-case graph
+ int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+ int n_past = hparams.n_ctx - n_tokens;
+ llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+ ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+ // measure memory requirements for the graph
+ size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+ fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+ // debug - for comparison with scratch buffer
+ //size_t prev_req =
+ // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+ // MEM_REQ_SCRATCH1().at(ctx->model.type) +
+ // MEM_REQ_EVAL().at(ctx->model.type);
+ //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+ // recreate allocator with exact memory requirements
+ ggml_allocr_free(ctx->alloc);
+
+ ctx->buf_alloc.resize(alloc_size);
+ ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+ }
+ #else
  ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+ #endif

+ #ifdef LLAMA_USE_SCRATCH
  ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+ #endif
  }

  #ifdef GGML_USE_METAL
@@ -3251,9 +3385,6 @@ struct llama_context * llama_init_from_file(
  }

  void llama_free(struct llama_context * ctx) {
- if (ctx->model_owner) {
- delete &ctx->model;
- }
  delete ctx;
  }

@@ -3663,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
- const int n_embd = hparams.n_embd;
+ const int n_embd = hparams.n_embd_gqa();
  const int n_ctx = hparams.n_ctx;

  const size_t kv_size = kv_self.buf.size;
@@ -3766,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
- const int n_embd = hparams.n_embd;
+ const int n_embd = hparams.n_embd_gqa();
  const int n_ctx = hparams.n_ctx;

  size_t kv_size;
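
The largest change bundled in this release is the optional ggml allocator path: on builds without cuBLAS or Metal, LLAMA_USE_ALLOCATOR replaces the fixed scratch buffers, and llama_new_context_with_model sizes the compute buffer by first measuring a worst-case graph. Below is a condensed, non-authoritative sketch of that measure-then-allocate pattern, using only the ggml / ggml-alloc calls that appear in the diff; the toy build_graph() (a single 64x64 matmul) stands in for llama_build_graph(), and the dimensions and buffer handling are illustrative, not the gem's actual values.

// Sketch only: the measure-then-allocate flow of LLAMA_USE_ALLOCATOR, assuming
// the ggml / ggml-alloc API used in the diff above.
#include "ggml.h"
#include "ggml-alloc.h"
#include <cstdint>
#include <cstdio>
#include <vector>

static const size_t tensor_alignment = 32;

// Builds the same graph whether the allocator is in measure mode or not.
// In measure mode no real memory is attached, so tensor data must not be touched.
static ggml_cgraph * build_graph(ggml_context * ctx, ggml_allocr * alloc) {
    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_allocr_alloc(alloc, a);
    ggml_allocr_alloc(alloc, b);
    if (!ggml_allocr_is_measure(alloc)) {
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 2.0f);
    }
    ggml_build_forward_expand(gf, ggml_mul_mat(ctx, a, b));
    return gf;
}

int main() {
    // The ggml context only stores tensor/graph metadata; tensor data lives in buf_alloc.
    std::vector<uint8_t> buf_meta(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
    ggml_init_params params = { buf_meta.size(), buf_meta.data(), /*no_alloc =*/ true };

    // Pass 1: measure the worst-case graph to learn how much tensor memory it needs.
    ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_context * ctx  = ggml_init(params);
    size_t alloc_size   = ggml_allocr_alloc_graph(alloc, build_graph(ctx, alloc)) + tensor_alignment;
    ggml_free(ctx);
    ggml_allocr_free(alloc);

    // Pass 2: recreate the allocator over a buffer of exactly the measured size.
    std::vector<uint8_t> buf_alloc(alloc_size);
    alloc = ggml_allocr_new(buf_alloc.data(), buf_alloc.size(), tensor_alignment);
    fprintf(stderr, "compute buffer size = %.2f MB\n", alloc_size/1024.0/1024.0);

    // Per evaluation: reset, rebuild the graph, then allocate all of its tensors at once.
    ggml_allocr_reset(alloc);
    ctx = ggml_init(params);
    ggml_cgraph * gf = build_graph(ctx, alloc);
    ggml_allocr_alloc_graph(alloc, gf);
    // ... run gf with the usual ggml compute call and read back the result tensor ...
    ggml_free(ctx);
    ggml_allocr_free(alloc);
    return 0;
}

During normal evaluation the allocator is simply reset and reused for each batch, which is what the LLAMA_USE_ALLOCATOR branches in llama_eval_internal above do.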
@@ -108,6 +108,7 @@ extern "C" {

  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool low_vram; // if true, reduce VRAM usage at the cost of performance
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
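
On the API side, llama.h gains a single new field, mul_mat_q, in llama_context_params. A minimal C++ sketch of opting in through the C API follows, assuming the llama_backend_init/llama_backend_free and llama_free_model entry points present in llama.h of this era; the model path and the n_gpu_layers value are placeholders.

// Sketch: enabling the experimental quantized mat-mul kernels via the new flag.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(/*numa =*/ false);

    llama_context_params params = llama_context_default_params();
    params.mul_mat_q    = true;  // new in this release; only takes effect on CUDA builds
    params.n_gpu_layers = 32;    // placeholder; offload some layers so the flag matters

    llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    // ... tokenize and llama_eval() as usual; mul_mat_q only changes which CUDA kernels run ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

From Ruby, the same flag is reachable through the mul_mat_q / mul_mat_q= accessors added to the context-params signature further down in this diff.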
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.5'
+ VERSION = '0.3.6'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-1a94186'
+ LLAMA_CPP_VERSION = 'master-468ea24'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -163,6 +163,8 @@ module LLaMACpp
  def rope_freq_scale: () -> Float
  def low_vram: () -> bool
  def low_vram=: (bool) -> bool
+ def mul_mat_q: () -> bool
+ def mul_mat_q=: (bool) -> bool
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.5
+ version: 0.3.6
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-07-29 00:00:00.000000000 Z
+ date: 2023-08-04 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -30,6 +30,8 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-alloc.c
+ - ext/llama_cpp/src/ggml-alloc.h
  - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
  - ext/llama_cpp/src/ggml-metal.h