llama_cpp 0.3.5 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2090 -438
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +17 -16
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +49 -26
- data/ext/llama_cpp/src/ggml.h +12 -1
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama.cpp +199 -68
- data/ext/llama_cpp/src/llama.h +1 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -56,8 +56,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
+
 
 // available llama models
 enum e_model {
@@ -327,13 +333,22 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
         }
-    }
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocr_free(alloc);
+        }
+#endif
+    }
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -371,7 +386,17 @@ struct llama_context {
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    llama_ctx_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    int buf_last = 0;
+    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif
 
 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
@@ -381,9 +406,6 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-    int buf_last = 0;
-    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
@@ -879,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
+        /*.mul_mat_q =*/ false,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
@@ -1006,6 +1029,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1134,9 +1158,11 @@ static void llama_model_load_internal(
     }
 
     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1230,12 +1256,16 @@ static void llama_model_load_internal(
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
         // this is the total memory required to run the inference
-        const size_t mem_required =
+        size_t mem_required =
             ctx_size +
-            mmapped_size - vram_weights + // weights in VRAM not in memory
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+        mem_required +=
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at(model.type);
+#endif
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1341,6 +1371,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1351,7 +1382,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1360,32 +1392,15 @@ static bool llama_model_load(
     }
 }
 
-
-//
-// - lctx: llama context
-// - tokens: new batch of tokens to process
-// - embd embeddings input
-// - n_tokens number of tokens
-// - n_past: the context size so far
-// - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
         const llama_token * tokens,
         const float * embd,
         int n_tokens,
-        int n_past,
-        int n_threads,
-        const char * cgraph_fname) {
+        int n_past) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-    const int64_t t_start_us = ggml_time_us();
-
     const int N = n_tokens;
 
     const auto & model = lctx.model;
@@ -1401,10 +1416,8 @@ static bool llama_eval_internal(
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
-
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
@@ -1416,26 +1429,35 @@
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
 
+
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc =*/ false,
     };
 
+#ifdef LLAMA_USE_ALLOCATOR
+    params.no_alloc = true;
+#endif
+
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+#else
         memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
         ggml_set_name(inp_tokens, "inp_tokens");
 
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1445,7 +1467,15 @@
 #endif
 
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+#else
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1472,6 +1502,17 @@
     }
 #endif // GGML_USE_CUBLAS
 
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+#else
+    ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
 
@@ -1567,9 +1608,6 @@
         ggml_set_name(KQ, "KQ");
 
         // KQ_scaled = KQ / sqrt(n_embd_head)
-        struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-        ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
         // KQ_scaled shape [n_past + N, N, n_head, 1]
         struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
         offload_func_kq(KQ_scaled);
@@ -1685,9 +1723,6 @@
 
     lctx.use_buf(ctx0, 0);
 
-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1698,8 +1733,6 @@
         cur = ggml_mul(ctx0, cur, model.norm);
         // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
-
-        embeddings = cur;
     }
 
     // lm_head
@@ -1711,12 +1744,88 @@
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
-    // run the computation
     ggml_build_forward_expand(gf, cur);
 
-
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// evaluate the transformer
+//
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads,
+        const char * cgraph_fname) {
+
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+    const int64_t t_start_us = ggml_time_us();
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    LLAMA_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_vocab = hparams.n_vocab;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_reset(lctx.alloc);
+#endif
+
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 
 #if GGML_USE_MPI
+    const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -1728,7 +1837,10 @@ static bool llama_eval_internal(
         //}
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor (lctx.ctx_metal,
+        ggml_metal_get_tensor (lctx.ctx_metal, res);
+        if (!lctx.embedding.empty()) {
+            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+        }
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1759,8 +1871,6 @@
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-
     if (cgraph_fname) {
         ggml_graph_export(gf, cgraph_fname);
     }
@@ -1798,21 +1908,6 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-
-#if 0
-    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0,
-            lctx.work_buffer.size()/1024.0/1024.0,
-            n_past, N);
-#endif
-
-    ggml_free(ctx0);
-
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1924,7 +2019,9 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
+                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
@@ -3101,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
         delete model;
@@ -3178,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
+#ifdef LLAMA_USE_ALLOCATOR
+        {
+            static const size_t tensor_alignment = 32;
+            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+            // create measure allocator
+            ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+            // build worst-case graph
+            int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+            int n_past = hparams.n_ctx - n_tokens;
+            llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+            // measure memory requirements for the graph
+            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+            fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+            // debug - for comparison with scratch buffer
+            //size_t prev_req =
+            //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+            //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+            //    MEM_REQ_EVAL().at(ctx->model.type);
+            //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+            // recreate allocator with exact memory requirements
+            ggml_allocr_free(ctx->alloc);
+
+            ctx->buf_alloc.resize(alloc_size);
+            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        }
+#else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif
 
+#ifdef LLAMA_USE_SCRATCH
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
     }
 
 #ifdef GGML_USE_METAL
@@ -3251,9 +3385,6 @@ struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }
 
@@ -3663,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     const auto & kv_self = ctx->kv_self;
     const auto & hparams = ctx->model.hparams;
     const int n_layer = hparams.n_layer;
-    const int n_embd = hparams.n_embd;
+    const int n_embd = hparams.n_embd_gqa();
     const int n_ctx = hparams.n_ctx;
 
     const size_t kv_size = kv_self.buf.size;
@@ -3766,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const auto & kv_self = ctx->kv_self;
     const auto & hparams = ctx->model.hparams;
     const int n_layer = hparams.n_layer;
-    const int n_embd = hparams.n_embd;
+    const int n_embd = hparams.n_embd_gqa();
     const int n_ctx = hparams.n_ctx;
 
     size_t kv_size;
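The ggml-alloc changes above follow a measure-then-allocate pattern: `llama_new_context_with_model` creates a measure allocator, builds a worst-case graph with `llama_build_graph`, asks `ggml_allocr_alloc_graph` how much memory that graph needs, then recreates the allocator over a buffer of exactly that size; each `llama_eval_internal` call afterwards resets the allocator and re-allocates the freshly built graph. The sketch below condenses that flow using only the ggml-alloc calls visible in this diff; the `build_graph_fn` callback, the `std::vector` backing buffer, and the helper name are illustrative assumptions, not llama.cpp code.

```cpp
#include <cstdint>
#include <vector>

#include "ggml.h"
#include "ggml-alloc.h"

// Hypothetical graph builder type; stands in for llama_build_graph() in the diff above.
typedef ggml_cgraph * (*build_graph_fn)(ggml_context * ctx, bool worst_case);

// Measure-then-allocate, mirroring llama_new_context_with_model():
// 1) a measure allocator computes the peak memory of a worst-case graph,
// 2) a real allocator is then created over a buffer of exactly that size.
static ggml_allocr * create_exact_allocator(build_graph_fn build_graph,
                                            ggml_context * ctx,
                                            std::vector<uint8_t> & buf,
                                            size_t tensor_alignment = 32) {
    // measure pass: no real memory is assigned, only the required size is computed
    ggml_allocr * measure = ggml_allocr_new_measure(tensor_alignment);
    ggml_cgraph * gf      = build_graph(ctx, /*worst_case=*/true);
    size_t alloc_size     = ggml_allocr_alloc_graph(measure, gf) + tensor_alignment;
    ggml_allocr_free(measure);

    // allocate a buffer of exactly the measured size and wrap it in a real allocator
    buf.resize(alloc_size);
    return ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);
}

// Per evaluation, the diff then does the equivalent of:
//     ggml_allocr_reset(alloc);
//     ggml_cgraph * gf = llama_build_graph(...);
//     ggml_allocr_alloc_graph(alloc, gf);   // assigns addresses inside the buffer to every tensor
```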
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -108,6 +108,7 @@ extern "C" {
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool low_vram; // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q; // if true, use experimental mul_mat_q kernels
         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
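The only public-API change in `llama.h` is the new `mul_mat_q` flag in `llama_context_params`, which `llama_model_load_internal` forwards to `ggml_cuda_set_mul_mat_q` when CUBLAS is enabled. A minimal sketch of how a caller might opt in follows; the model path is a placeholder and `llama_free_model` is assumed from the surrounding C API rather than taken from this diff.

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.mul_mat_q = true; // opt in to the experimental mul_mat_q kernels (default is false)

    // "model.bin" is a placeholder path used only for illustration
    llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    // ... evaluate tokens with llama_eval(ctx, ...) ...
    llama_free(ctx);
    llama_free_model(model); // assumed counterpart of llama_load_model_from_file
    return 0;
}
```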
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.5'
+  VERSION = '0.3.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-468ea24'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -163,6 +163,8 @@ module LLaMACpp
     def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
+    def mul_mat_q: () -> bool
+    def mul_mat_q=: (bool) -> bool
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.5
+  version: 0.3.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-08-04 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -30,6 +30,8 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-alloc.c
+- ext/llama_cpp/src/ggml-alloc.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml-metal.h