llama_cpp 0.3.5 → 0.3.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2090 -438
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +17 -16
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +49 -26
- data/ext/llama_cpp/src/ggml.h +12 -1
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama.cpp +199 -68
- data/ext/llama_cpp/src/llama.h +1 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -56,8 +56,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
+
 
 // available llama models
 enum e_model {
@@ -327,13 +333,22 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
         }
-    }
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocr_free(alloc);
+        }
+#endif
+    }
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -371,7 +386,17 @@ struct llama_context {
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    llama_ctx_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    int buf_last = 0;
+    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif
 
 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
@@ -381,9 +406,6 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-    int buf_last = 0;
-    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
@@ -879,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
+        /*.mul_mat_q =*/ false,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
@@ -1006,6 +1029,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1134,9 +1158,11 @@ static void llama_model_load_internal(
     }
 
     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1230,12 +1256,16 @@ static void llama_model_load_internal(
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
         // this is the total memory required to run the inference
-        const size_t mem_required =
+        size_t mem_required =
             ctx_size +
-            mmapped_size - vram_weights
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+        mem_required +=
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at(model.type);
+#endif
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1341,6 +1371,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1351,7 +1382,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1360,32 +1392,15 @@ static bool llama_model_load(
     }
 }
 
-
-//
-//  - lctx:      llama context
-//  - tokens:    new batch of tokens to process
-//  - embd       embeddings input
-//  - n_tokens   number of tokens
-//  - n_past:    the context size so far
-//  - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
         const llama_token * tokens,
         const float * embd,
         int n_tokens,
-        int n_past,
-        int n_threads,
-        const char * cgraph_fname) {
+        int n_past) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-    const int64_t t_start_us = ggml_time_us();
-
     const int N = n_tokens;
 
     const auto & model = lctx.model;
@@ -1401,10 +1416,8 @@ static bool llama_eval_internal(
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
-
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
@@ -1416,26 +1429,35 @@ static bool llama_eval_internal(
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
 
+
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc =*/ false,
     };
 
+#ifdef LLAMA_USE_ALLOCATOR
+    params.no_alloc = true;
+#endif
+
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+#else
         memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
         ggml_set_name(inp_tokens, "inp_tokens");
 
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1445,7 +1467,15 @@ static bool llama_eval_internal(
 #endif
 
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+#else
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1472,6 +1502,17 @@ static bool llama_eval_internal(
     }
 #endif // GGML_USE_CUBLAS
 
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+#else
+    ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
 
@@ -1567,9 +1608,6 @@ static bool llama_eval_internal(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
-            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
@@ -1685,9 +1723,6 @@ static bool llama_eval_internal(
 
     lctx.use_buf(ctx0, 0);
 
-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1698,8 +1733,6 @@ static bool llama_eval_internal(
         cur = ggml_mul(ctx0, cur, model.norm);
         // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
-
-        embeddings = cur;
     }
 
     // lm_head
@@ -1711,12 +1744,88 @@ static bool llama_eval_internal(
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
-    // run the computation
     ggml_build_forward_expand(gf, cur);
 
-
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// evaluate the transformer
+//
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - embd       embeddings input
+//   - n_tokens   number of tokens
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads,
+        const char * cgraph_fname) {
+
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+    const int64_t t_start_us = ggml_time_us();
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    LLAMA_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_vocab = hparams.n_vocab;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_reset(lctx.alloc);
+#endif
+
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 
 #if GGML_USE_MPI
+    const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -1728,7 +1837,10 @@ static bool llama_eval_internal(
         //}
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal,
+        ggml_metal_get_tensor   (lctx.ctx_metal, res);
+        if (!lctx.embedding.empty()) {
+            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+        }
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1759,8 +1871,6 @@ static bool llama_eval_internal(
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-
     if (cgraph_fname) {
         ggml_graph_export(gf, cgraph_fname);
     }
@@ -1798,21 +1908,6 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-
-#if 0
-    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0,
-            lctx.work_buffer.size()/1024.0/1024.0,
-            n_past, N);
-#endif
-
-    ggml_free(ctx0);
-
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1924,7 +2019,9 @@ struct llama_tokenizer {
         if (token == vocab_.token_to_id.end()) {
             // output any symbols that did not form tokens as bytes.
             for (int j = 0; j < (int) symbol.n; ++j) {
-                llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                // NOTE: old version, before #2420 - not sure what are the implications of this
+                //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                 output.push_back(token_id);
             }
         } else {
@@ -3101,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
         delete model;
@@ -3178,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
+#ifdef LLAMA_USE_ALLOCATOR
+        {
+            static const size_t tensor_alignment = 32;
+            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+            // create measure allocator
+            ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+            // build worst-case graph
+            int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+            int n_past = hparams.n_ctx - n_tokens;
+            llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+            // measure memory requirements for the graph
+            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+            fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+            // debug - for comparison with scratch buffer
+            //size_t prev_req =
+            //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+            //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+            //    MEM_REQ_EVAL().at(ctx->model.type);
+            //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+            // recreate allocator with exact memory requirements
+            ggml_allocr_free(ctx->alloc);
+
+            ctx->buf_alloc.resize(alloc_size);
+            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        }
+#else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif
 
+#ifdef LLAMA_USE_SCRATCH
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
     }
 
 #ifdef GGML_USE_METAL
@@ -3251,9 +3385,6 @@ struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }
 
@@ -3663,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     const auto & kv_self = ctx->kv_self;
     const auto & hparams = ctx->model.hparams;
     const int n_layer = hparams.n_layer;
-    const int n_embd = hparams.n_embd;
+    const int n_embd = hparams.n_embd_gqa();
     const int n_ctx = hparams.n_ctx;
 
     const size_t kv_size = kv_self.buf.size;
@@ -3766,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
    const auto & kv_self = ctx->kv_self;
    const auto & hparams = ctx->model.hparams;
    const int n_layer = hparams.n_layer;
-    const int n_embd = hparams.n_embd;
+    const int n_embd = hparams.n_embd_gqa();
    const int n_ctx = hparams.n_ctx;
 
    size_t kv_size;
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -108,6 +108,7 @@ extern "C" {
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool low_vram; // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q; // if true, use experimental mul_mat_q kernels
         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.5'
+  VERSION = '0.3.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-468ea24'
 end
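After updating, the bumped constants can be checked at runtime. A minimal sketch using only the two constants changed above (expected values taken from this diff):

    require 'llama_cpp'

    # Both constants are defined in lib/llama_cpp/version.rb (see diff above).
    puts LLaMACpp::VERSION            # expected: "0.3.6"
    puts LLaMACpp::LLAMA_CPP_VERSION  # expected: "master-468ea24"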
data/sig/llama_cpp.rbs
CHANGED
@@ -163,6 +163,8 @@ module LLaMACpp
     def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
+    def mul_mat_q: () -> bool
+    def mul_mat_q=: (bool) -> bool
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
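The two signatures added above expose llama.cpp's new `mul_mat_q` flag (the `bool mul_mat_q` field added to llama.h and defaulted to false in llama.cpp). A hedged usage sketch, assuming the accessors belong to the same params class as the neighboring `low_vram` and `seed` accessors and that model/context construction follows the keyword pattern from the gem's README; names outside this diff are assumptions, not confirmed by it:

    require 'llama_cpp'

    # Assumed: the accessors shown in sig/llama_cpp.rbs live on LLaMACpp::ContextParams.
    params = LLaMACpp::ContextParams.new
    params.mul_mat_q = true  # new in 0.3.6; enables llama.cpp's experimental mul_mat_q CUDA kernels

    # Assumed constructor keywords (model_path:, params:, model:) per the README example.
    model   = LLaMACpp::Model.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)
    context = LLaMACpp::Context.new(model: model)

The flag only changes behavior on CUDA builds; on other backends llama.cpp ignores it, as the `(void) mul_mat_q;` line in the llama.cpp diff suggests.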
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.5
+  version: 0.3.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-08-04 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -30,6 +30,8 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-alloc.c
+- ext/llama_cpp/src/ggml-alloc.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml-metal.h