llama_cpp 0.3.5 → 0.3.6

@@ -56,8 +56,14 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
 
+ #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+ #include "ggml-alloc.h"
+ #define LLAMA_USE_ALLOCATOR
+ #else
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
+ #endif
+
 
  // available llama models
  enum e_model {
@@ -327,13 +333,22 @@ struct llama_model {
 
  struct llama_context {
      llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
- #ifdef GGML_USE_METAL
      ~llama_context() {
+         if (model_owner) {
+             delete &model;
+         }
+ #ifdef GGML_USE_METAL
          if (ctx_metal) {
              ggml_metal_free(ctx_metal);
          }
-     }
  #endif
+ #ifdef LLAMA_USE_ALLOCATOR
+         if (alloc) {
+             ggml_allocr_free(alloc);
+         }
+ #endif
+     }
+
      std::mt19937 rng;
 
      bool has_evaluated_once = false;
@@ -371,7 +386,17 @@ struct llama_context {
      // memory buffers used to evaluate the model
      // TODO: move in llama_state
      llama_ctx_buffer buf_compute;
+
+ #ifdef LLAMA_USE_ALLOCATOR
+     llama_ctx_buffer buf_alloc;
+     ggml_allocr * alloc = NULL;
+ #endif
+
+ #ifdef LLAMA_USE_SCRATCH
      llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+     int buf_last = 0;
+     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+ #endif
 
  #ifdef GGML_USE_METAL
      ggml_metal_context * ctx_metal = NULL;
@@ -381,9 +406,6 @@ struct llama_context {
      ggml_mpi_context * ctx_mpi = NULL;
  #endif
 
-     int buf_last = 0;
-     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
      void use_buf(struct ggml_context * ctx, int i) {
  #if defined(LLAMA_USE_SCRATCH)
          size_t last_size = 0;
@@ -879,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
          /*.progress_callback =*/ nullptr,
          /*.progress_callback_user_data =*/ nullptr,
          /*.low_vram =*/ false,
+         /*.mul_mat_q =*/ false,
          /*.f16_kv =*/ true,
          /*.logits_all =*/ false,
          /*.vocab_only =*/ false,
@@ -1006,6 +1029,7 @@ static void llama_model_load_internal(
          int n_gpu_layers,
          int main_gpu,
          const float * tensor_split,
+         const bool mul_mat_q,
          float rope_freq_base,
          float rope_freq_scale,
          bool low_vram,
@@ -1134,9 +1158,11 @@ static void llama_model_load_internal(
      }
 
      (void) main_gpu;
+     (void) mul_mat_q;
  #if defined(GGML_USE_CUBLAS)
      fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
      ggml_cuda_set_main_device(main_gpu);
+     ggml_cuda_set_mul_mat_q(mul_mat_q);
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
  #elif defined(GGML_USE_CLBLAST)
@@ -1230,12 +1256,16 @@ static void llama_model_load_internal(
          const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
          // this is the total memory required to run the inference
-         const size_t mem_required =
+         size_t mem_required =
              ctx_size +
-             mmapped_size - vram_weights + // weights in VRAM not in memory
+             mmapped_size - vram_weights; // weights in VRAM not in memory
+
+ #ifndef LLAMA_USE_ALLOCATOR
+         mem_required +=
              MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
              MEM_REQ_SCRATCH1().at(model.type) +
              MEM_REQ_EVAL().at(model.type);
+ #endif
 
          // this is the memory required by one llama_state
          const size_t mem_required_state =
@@ -1341,6 +1371,7 @@ static bool llama_model_load(
          int n_gpu_layers,
          int main_gpu,
          const float * tensor_split,
+         const bool mul_mat_q,
          float rope_freq_base,
          float rope_freq_scale,
          bool low_vram,
@@ -1351,7 +1382,8 @@ static bool llama_model_load(
          llama_progress_callback progress_callback,
          void *progress_callback_user_data) {
      try {
-         llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+         llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                 main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
          return true;
      } catch (const std::exception & err) {
@@ -1360,32 +1392,15 @@ static bool llama_model_load(
      }
  }
 
- // evaluate the transformer
- //
- // - lctx: llama context
- // - tokens: new batch of tokens to process
- // - embd embeddings input
- // - n_tokens number of tokens
- // - n_past: the context size so far
- // - n_threads: number of threads to use
- //
- static bool llama_eval_internal(
+ static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
          const llama_token * tokens,
          const float * embd,
          int n_tokens,
-         int n_past,
-         int n_threads,
-         const char * cgraph_fname) {
+         int n_past) {
 
      LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
- #ifdef GGML_USE_MPI
-     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
- #endif
-
-     const int64_t t_start_us = ggml_time_us();
-
      const int N = n_tokens;
 
      const auto & model = lctx.model;
@@ -1401,10 +1416,8 @@ static bool llama_eval_internal(
      const int64_t n_head = hparams.n_head;
      const int64_t n_head_kv = hparams.n_head_kv;
      const int64_t n_embd_head = hparams.n_embd_head();
-     const int64_t n_vocab = hparams.n_vocab;
      const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
-
      LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
      const float freq_base = hparams.rope_freq_base;
@@ -1416,26 +1429,35 @@ static bool llama_eval_internal(
      auto & mem_per_token = lctx.mem_per_token;
      auto & buf_compute = lctx.buf_compute;
 
+
      struct ggml_init_params params = {
          /*.mem_size =*/ buf_compute.size,
          /*.mem_buffer =*/ buf_compute.addr,
          /*.no_alloc =*/ false,
      };
 
+ #ifdef LLAMA_USE_ALLOCATOR
+     params.no_alloc = true;
+ #endif
+
      struct ggml_context * ctx0 = ggml_init(params);
 
      ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-     // for big prompts, if BLAS is enabled, it is better to use only one thread
-     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-     n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
      struct ggml_tensor * cur;
      struct ggml_tensor * inpL;
 
      if (tokens) {
          struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+ #ifdef LLAMA_USE_ALLOCATOR
+         ggml_allocr_alloc(lctx.alloc, inp_tokens);
+         if (!ggml_allocr_is_measure(lctx.alloc)) {
+             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+         }
+ #else
          memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ #endif
          ggml_set_name(inp_tokens, "inp_tokens");
 
          inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
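
The same guarded pattern recurs for every input tensor in this hunk and the next ones: with params.no_alloc = true, ggml_new_tensor_* only creates metadata, ggml_allocr_alloc assigns the data address, and the copy is skipped while the allocator is still in measure mode (there is no real buffer yet). A minimal sketch of a helper capturing that idea, assuming only the ggml-alloc calls visible above; the helper name is illustrative and not part of the patch:

    #include <string.h>   // memcpy
    #include "ggml.h"
    #include "ggml-alloc.h"

    // Illustrative helper (not in llama.cpp): allocate a tensor through the
    // allocator and copy input data only when the allocator is not measuring.
    static void alloc_and_copy(struct ggml_allocr * alloc, struct ggml_tensor * t,
                               const void * src, size_t nbytes) {
        ggml_allocr_alloc(alloc, t);           // assigns t->data inside the allocator buffer
        if (!ggml_allocr_is_measure(alloc)) {  // measure pass has no real buffer to write into
            memcpy(t->data, src, nbytes);
        }
    }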
@@ -1445,7 +1467,15 @@ static bool llama_eval_internal(
  #endif
 
          inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+ #ifdef LLAMA_USE_ALLOCATOR
+         ggml_allocr_alloc(lctx.alloc, inpL);
+         if (!ggml_allocr_is_measure(lctx.alloc)) {
+             memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+         }
+ #else
          memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ #endif
      }
 
      const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1472,6 +1502,17 @@ static bool llama_eval_internal(
      }
  #endif // GGML_USE_CUBLAS
 
+     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ #ifdef LLAMA_USE_ALLOCATOR
+     ggml_allocr_alloc(lctx.alloc, KQ_scale);
+     if (!ggml_allocr_is_measure(lctx.alloc)) {
+         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+     }
+ #else
+     ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ #endif
+     ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
      for (int il = 0; il < n_layer; ++il) {
          ggml_format_name(inpL, "layer_inp_%d", il);
 
@@ -1567,9 +1608,6 @@ static bool llama_eval_internal(
              ggml_set_name(KQ, "KQ");
 
              // KQ_scaled = KQ / sqrt(n_embd_head)
-             struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-             ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
              // KQ_scaled shape [n_past + N, N, n_head, 1]
              struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
              offload_func_kq(KQ_scaled);
@@ -1685,9 +1723,6 @@ static bool llama_eval_internal(
 
      lctx.use_buf(ctx0, 0);
 
-     // used at the end to optionally extract the embeddings
-     struct ggml_tensor * embeddings = NULL;
-
      // norm
      {
          cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1698,8 +1733,6 @@ static bool llama_eval_internal(
          cur = ggml_mul(ctx0, cur, model.norm);
          // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
          ggml_set_name(cur, "result_norm");
-
-         embeddings = cur;
      }
 
      // lm_head
@@ -1711,12 +1744,88 @@ static bool llama_eval_internal(
      // logits -> probs
      //cur = ggml_soft_max_inplace(ctx0, cur);
 
-     // run the computation
      ggml_build_forward_expand(gf, cur);
 
-     // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+     if (mem_per_token == 0) {
+         mem_per_token = ggml_used_mem(ctx0)/N;
+     }
+
+ #if 0
+     printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+             ggml_used_mem(ctx0)/1024.0/1024.0,
+             lctx.get_buf_max_mem(0)/1024.0/1024.0,
+             lctx.get_buf_max_mem(1)/1024.0/1024.0,
+             lctx.work_buffer.size()/1024.0/1024.0,
+             n_past, N);
+ #endif
+
+     ggml_free(ctx0);
+
+     return gf;
+ }
+
+ // evaluate the transformer
+ //
+ // - lctx: llama context
+ // - tokens: new batch of tokens to process
+ // - embd embeddings input
+ // - n_tokens number of tokens
+ // - n_past: the context size so far
+ // - n_threads: number of threads to use
+ //
+ static bool llama_eval_internal(
+         llama_context & lctx,
+         const llama_token * tokens,
+         const float * embd,
+         int n_tokens,
+         int n_past,
+         int n_threads,
+         const char * cgraph_fname) {
+
+     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+     const int64_t t_start_us = ggml_time_us();
+
+ #ifdef GGML_USE_MPI
+     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif
+
+     const int N = n_tokens;
+
+     const auto & model = lctx.model;
+     const auto & hparams = model.hparams;
+
+     const auto & kv_self = lctx.kv_self;
+
+     LLAMA_ASSERT(!!kv_self.ctx);
+
+     const int64_t n_embd = hparams.n_embd;
+     const int64_t n_vocab = hparams.n_vocab;
+
+ #ifdef LLAMA_USE_ALLOCATOR
+     ggml_allocr_reset(lctx.alloc);
+ #endif
+
+     ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+ #ifdef LLAMA_USE_ALLOCATOR
+     ggml_allocr_alloc_graph(lctx.alloc, gf);
+ #endif
+
+     // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+     // for big prompts, if BLAS is enabled, it is better to use only one thread
+     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+     n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+     LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+     LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 
  #if GGML_USE_MPI
+     const int64_t n_layer = hparams.n_layer;
      ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
  #endif
 
@@ -1728,7 +1837,10 @@ static bool llama_eval_internal(
          //}
          ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
          ggml_metal_graph_compute(lctx.ctx_metal, gf);
-         ggml_metal_get_tensor (lctx.ctx_metal, cur);
+         ggml_metal_get_tensor (lctx.ctx_metal, res);
+         if (!lctx.embedding.empty()) {
+             ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+         }
      } else {
          // IMPORTANT:
          // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1759,8 +1871,6 @@ static bool llama_eval_internal(
      // update kv token count
      lctx.kv_self.n = n_past + N;
 
-     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-
      if (cgraph_fname) {
          ggml_graph_export(gf, cgraph_fname);
      }
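
With the graph now built in llama_build_graph, llama_eval_internal no longer keeps direct pointers to its outputs; it instead takes the last two nodes of the finished graph and asserts their names, "result_output" and "result_norm". A hedged sketch of a more defensive lookup by name, using only the cgraph fields visible in this diff (the helper itself is illustrative, not part of the patch):

    #include <string.h>
    #include "ggml.h"

    // Illustrative helper (not in the patch): scan a built graph for a node with
    // a given name instead of relying on its position at the end of gf->nodes.
    static struct ggml_tensor * find_node_by_name(struct ggml_cgraph * gf, const char * name) {
        for (int i = gf->n_nodes - 1; i >= 0; --i) {   // search from the back: outputs are appended last
            if (strcmp(gf->nodes[i]->name, name) == 0) {
                return gf->nodes[i];
            }
        }
        return NULL;
    }

    // e.g.: struct ggml_tensor * res = find_node_by_name(gf, "result_output");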
@@ -1798,21 +1908,6 @@ static bool llama_eval_internal(
          memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
      }
 
-     if (mem_per_token == 0) {
-         mem_per_token = ggml_used_mem(ctx0)/N;
-     }
-
- #if 0
-     printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-             ggml_used_mem(ctx0)/1024.0/1024.0,
-             lctx.get_buf_max_mem(0)/1024.0/1024.0,
-             lctx.get_buf_max_mem(1)/1024.0/1024.0,
-             lctx.work_buffer.size()/1024.0/1024.0,
-             n_past, N);
- #endif
-
-     ggml_free(ctx0);
-
      // measure the performance only for the single-token evals
      if (N == 1) {
          lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1924,7 +2019,9 @@ struct llama_tokenizer {
              if (token == vocab_.token_to_id.end()) {
                  // output any symbols that did not form tokens as bytes.
                  for (int j = 0; j < (int) symbol.n; ++j) {
-                     llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                     // NOTE: old version, before #2420 - not sure what are the implications of this
+                     //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                     llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                      output.push_back(token_id);
                  }
              } else {
@@ -3101,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
      ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
      if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-                 params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                  params.progress_callback_user_data)) {
          delete model;
@@ -3178,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
              ctx->embedding.resize(hparams.n_embd);
          }
 
+ #ifdef LLAMA_USE_ALLOCATOR
+         {
+             static const size_t tensor_alignment = 32;
+             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+             ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+             // create measure allocator
+             ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+             // build worst-case graph
+             int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+             int n_past = hparams.n_ctx - n_tokens;
+             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+             // measure memory requirements for the graph
+             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+             fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+             // debug - for comparison with scratch buffer
+             //size_t prev_req =
+             //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+             //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+             //    MEM_REQ_EVAL().at(ctx->model.type);
+             //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+             // recreate allocator with exact memory requirements
+             ggml_allocr_free(ctx->alloc);
+
+             ctx->buf_alloc.resize(alloc_size);
+             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+         }
+ #else
          ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+ #endif
 
+ #ifdef LLAMA_USE_SCRATCH
          ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
          ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+ #endif
      }
 
  #ifdef GGML_USE_METAL
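
The block above is the heart of the new allocator path: a measure allocator sizes a worst-case graph, then a real allocator of exactly that size replaces it. A condensed, hedged sketch of the same two-pass pattern in isolation; the ggml-alloc calls are the ones used above, while the graph-builder callback and buffer type are illustrative stand-ins for llama_build_graph and llama_ctx_buffer:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-alloc.h"

    // Illustrative two-pass allocation (not the patch itself):
    //  pass 1 (measure): a dummy allocator walks the worst-case graph and reports the peak size
    //  pass 2 (commit):  a real allocator is created over a buffer of exactly that size
    ggml_allocr * make_exact_allocator(ggml_cgraph * (*build_worst_case_graph)(), std::vector<uint8_t> & buf) {
        const size_t tensor_alignment = 32;

        ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
        const size_t needed = ggml_allocr_alloc_graph(alloc, build_worst_case_graph()) + tensor_alignment;
        ggml_allocr_free(alloc);

        buf.resize(needed);  // stands in for llama_ctx_buffer::resize
        return ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);
    }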
@@ -3251,9 +3385,6 @@ struct llama_context * llama_init_from_file(
  }
 
  void llama_free(struct llama_context * ctx) {
-     if (ctx->model_owner) {
-         delete &ctx->model;
-     }
      delete ctx;
  }
 
@@ -3663,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
          const auto & kv_self = ctx->kv_self;
          const auto & hparams = ctx->model.hparams;
          const int n_layer = hparams.n_layer;
-         const int n_embd = hparams.n_embd;
+         const int n_embd = hparams.n_embd_gqa();
          const int n_ctx = hparams.n_ctx;
 
          const size_t kv_size = kv_self.buf.size;
@@ -3766,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
          const auto & kv_self = ctx->kv_self;
          const auto & hparams = ctx->model.hparams;
          const int n_layer = hparams.n_layer;
-         const int n_embd = hparams.n_embd;
+         const int n_embd = hparams.n_embd_gqa();
          const int n_ctx = hparams.n_ctx;
 
          size_t kv_size;
@@ -108,6 +108,7 @@ extern "C" {
 
          // Keep the booleans together to avoid misalignment during copy-by-value.
          bool low_vram; // if true, reduce VRAM usage at the cost of performance
+         bool mul_mat_q; // if true, use experimental mul_mat_q kernels
          bool f16_kv; // use fp16 for KV cache
          bool logits_all; // the llama_eval() call computes all logits, not just the last one
          bool vocab_only; // only load the vocabulary, no weights
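
For callers of the bundled C API, the new flag is just another field of llama_context_params. A minimal hedged sketch of turning it on, using only functions that appear in this diff or the public llama.h of this release; the model path and layer count are placeholders:

    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        params.mul_mat_q = true;   // opt in to the experimental mul_mat_q kernels (CUDA builds)
        params.n_gpu_layers = 32;  // illustrative value

        llama_model * model = llama_load_model_from_file("model.ggml", params); // placeholder path
        if (!model) {
            return 1;
        }
        llama_context * ctx = llama_new_context_with_model(model, params);
        // ... evaluate tokens with ctx ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }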
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.3.5'
+   VERSION = '0.3.6'
 
    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'master-1a94186'
+   LLAMA_CPP_VERSION = 'master-468ea24'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -163,6 +163,8 @@ module LLaMACpp
      def rope_freq_scale: () -> Float
      def low_vram: () -> bool
      def low_vram=: (bool) -> bool
+     def mul_mat_q: () -> bool
+     def mul_mat_q=: (bool) -> bool
      def seed: () -> Integer
      def seed=: (Integer) -> Integer
      def use_mlock: () -> bool
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
-   version: 0.3.5
+   version: 0.3.6
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-07-29 00:00:00.000000000 Z
+ date: 2023-08-04 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -30,6 +30,8 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-alloc.c
+ - ext/llama_cpp/src/ggml-alloc.h
  - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
  - ext/llama_cpp/src/ggml-metal.h