llama_cpp 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +165 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +217 -76
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +16 -5
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1082 -774
- data/ext/llama_cpp/src/ggml.h +64 -18
- data/ext/llama_cpp/src/llama.cpp +179 -51
- data/ext/llama_cpp/src/llama.h +15 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +3 -1
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -65,7 +65,7 @@
 // ggml_set_f32(a, 3.0f);
 // ggml_set_f32(b, 4.0f);
 //
-//
+// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 // printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //
@@ -132,10 +132,10 @@
 // {
 //   struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 //
-//   // a[
+//   // a[2, 1] = 1.0f;
 //   *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 //
-//   // a[
+//   // a[0, 2] = 2.0f;
 //   *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 //
 //   ...
@@ -197,12 +197,17 @@
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
-#define
+#define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
 #define GGML_UNUSED(x) (void)(x)
 
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -363,6 +368,8 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -414,12 +421,7 @@ extern "C" {
         bool is_param;
 
         struct ggml_tensor * grad;
-        struct ggml_tensor *
-        struct ggml_tensor * src1;
-        struct ggml_tensor * opt[GGML_MAX_OPT];
-
-        // thread scheduling
-        int n_tasks;
+        struct ggml_tensor * src[GGML_MAX_SRC];
 
         // performance
         int perf_runs;
@@ -432,19 +434,31 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t work_size;    // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
@@ -1161,6 +1175,31 @@ extern "C" {
             int s,
             int d);
 
+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0, // kernel size
+            int s0, // stride
+            int p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0,
+            int k1,
+            int s0,
+            int s1,
+            int p0,
+            int p1);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
@@ -1290,15 +1329,22 @@ extern "C" {
 
     GGML_API void ggml_set_param(
             struct ggml_context * ctx,
-            struct ggml_tensor
+            struct ggml_tensor * tensor);
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-
-
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
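The comments above spell out the new contract: ggml_graph_plan() sizes the work buffer, the caller allocates plan.work_data whenever work_size > 0, and ggml_graph_compute() now returns an exit code and can be interrupted through the abort callback added to ggml_cplan. A minimal sketch of that flow, assuming only what these declarations provide (llama.cpp wraps the same pattern in its new ggml_graph_compute_helper(), shown in the next file):

```cpp
#include <vector>
#include "ggml.h"

// Returns GGML_EXIT_SUCCESS, or GGML_EXIT_ABORTED if the abort callback fired mid-graph.
// `stop_flag` is a hypothetical flag owned by the caller (e.g. set from a signal handler).
static int compute_graph(struct ggml_cgraph * gf, int n_threads, bool * stop_flag) {
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

    std::vector<uint8_t> work;
    if (plan.work_size > 0) {           // the caller owns the work buffer now
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }

    plan.abort_callback      = [](void * data) { return *static_cast<bool *>(data); };
    plan.abort_callback_data = stop_flag;

    return ggml_graph_compute(gf, &plan);
}
```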
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,9 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_MPI
+#include "ggml-mpi.h"
+#endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
 #ifdef GGML_QKK_64
@@ -79,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -321,6 +343,9 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -330,6 +355,10 @@ struct llama_context {
     ggml_metal_context * ctx_metal = NULL;
 #endif
 
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -758,7 +787,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -849,7 +877,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void
+void llama_backend_init(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -862,6 +890,16 @@ void llama_init_backend(bool numa) {
     if (numa) {
         ggml_numa_init();
     }
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_init();
+#endif
+}
+
+void llama_backend_free() {
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_free();
+#endif
 }
 
 int64_t llama_time_us() {
@@ -1263,18 +1301,16 @@ static bool llama_eval_internal(
         llama_context & lctx,
     const llama_token * tokens,
     const float * embd,
-
-
-
+    int n_tokens,
+    int n_past,
+    int n_threads,
     const char * cgraph_fname) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-
-
-
-        return false;
-    }
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -1306,20 +1342,26 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
     if (tokens) {
-        struct ggml_tensor *
-
-
-
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
     }
@@ -1337,18 +1379,20 @@ static bool llama_eval_internal(
     offload_func_t offload_func_v = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
-
-
-
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
 #endif // GGML_USE_CUBLAS
 
     for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
         offload_func_t offload_func = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
@@ -1555,7 +1599,6 @@ static bool llama_eval_internal(
 
         // input for next layer
         inpL = cur;
-
     }
 
     lctx.use_buf(ctx0, 0);
@@ -1563,7 +1606,6 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
@@ -1578,7 +1620,6 @@ static bool llama_eval_internal(
         embeddings = cur;
     }
 
-
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
@@ -1591,8 +1632,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
@@ -1612,12 +1658,21 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif
 
+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
+    // update kv token count
+    lctx.kv_self.n = n_past + N;
+
+    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
    }
@@ -1633,23 +1688,17 @@
     //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}
 
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
     // extract logits
     {
         auto & logits_out = lctx.logits;
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -2118,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }
 
+static void llama_log_softmax(float * array, size_t size) {
+    float max_l = *std::max_element(array, array + size);
+    float sum = 0.f;
+    for (size_t i = 0; i < size; ++i) {
+        float p = expf(array[i] - max_l);
+        sum += p;
+        array[i] = p;
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        array[i] = logf(array[i] / sum);
+    }
+}
+
+void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale,
+        float smooth_factor) {
+    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+    assert(ctx);
+    auto n_vocab = llama_n_vocab(ctx);
+    assert(n_vocab == (int)candidates->size);
+    assert(!candidates->sorted);
+
+    std::vector<float> logits_base;
+    logits_base.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        logits_base.push_back(candidates->data[i].logit);
+    }
+    llama_log_softmax(logits_base.data(), candidates->size);
+
+    float* logits_guidance = llama_get_logits(guidance_ctx);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_guidance = logits_guidance[i];
+        float logit_base = logits_base[i];
+        logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+    }
+
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_base = logits_base[i];
+        float logit_guidance = logits_guidance[i];
+
+        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
 
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     assert(ctx);
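Reading the added sampler off the code above (my paraphrase, not text from the diff): with the base context's logits log-softmaxed to ℓ_b and the guidance context's to ℓ_g, it first pushes the guidance distribution away from the base one and then blends the result back in:

```latex
\ell'_g = \log \operatorname{softmax}\big( \mathrm{scale} \cdot (\ell_b - \ell_g) + \ell_g \big),
\qquad
\mathrm{logit}_i = \mathrm{smooth\_factor} \cdot \ell'_{g,i} + (1 - \mathrm{smooth\_factor}) \cdot \ell_{b,i}
```

So smooth_factor = 1.0 keeps only the guided distribution and 0.0 reduces to ordinary sampling, matching the parameter documentation added to llama.h below.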
@@ -2405,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            bool convert_incompatible_tensor = false;
             if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\
-
-                    fprintf(stderr, "========================================================================================\n\n");
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
                 }
             }
             if (tensor.name == "output.weight") {
@@ -2441,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            if (convert_incompatible_tensor) {
+                if (tensor.name == "output.weight") {
+                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+                    fprintf(stderr, "F16 will be used for this tensor instead.\n");
+                } else if (tensor.name == "tok_embeddings.weight") {
+                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+                    fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+                } else {
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
 #endif
 
             float * f32_data;
@@ -2575,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
 }
 
 struct llama_context * llama_new_context_with_model(
-
-
+        struct llama_model * model,
+        struct llama_context_params params) {
 
     if (!model) {
         return nullptr;
@@ -2645,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr = NULL;
         size_t data_size = 0;
@@ -2680,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
+#ifdef GGML_USE_MPI
+    ctx->ctx_mpi = ggml_mpi_init();
+
+    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+        const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        llama_backend_free();
+        exit(1);
+    }
+#endif
+
     return ctx;
 }
 
@@ -2802,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2966,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
-
-
+
+            ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
@@ -3120,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3140,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -3226,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -3246,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -158,7 +158,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
    // Call once at the start of the program
-    LLAMA_API void
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();
 
     LLAMA_API int64_t llama_time_us();
 
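A minimal sketch (not from the gem) of the renamed lifecycle pair: the old llama_init_backend becomes llama_backend_init, paired with the new llama_backend_free at shutdown.

```cpp
#include "llama.h"

int main() {
    llama_backend_init(/*numa*/ false);  // once, at program start

    // ... load a model, create contexts, run inference ...

    llama_backend_free();                // once, at program end (per the comment above, currently only needed for MPI builds)
    return 0;
}
```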
@@ -307,6 +309,18 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            struct llama_context * guidance_ctx,
+            float scale,
+            float smooth_factor);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
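A hedged usage sketch of the new sampler (my own, not shipped with the gem): it assumes `ctx` has evaluated the main prompt and `guidance_ctx` has evaluated the negative prompt plus the same generated tokens, as the @params above require.

```cpp
#include <vector>
#include "llama.h"

static llama_token sample_with_cfg(llama_context * ctx, llama_context * guidance_ctx,
                                   float scale, float smooth_factor) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits    = llama_get_logits(ctx);

    // build the candidate array from the raw (unsorted) logits of the main context
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array cur = { candidates.data(), candidates.size(), false };

    // mix in the guidance context, then pick a token as usual
    llama_sample_classifier_free_guidance(ctx, &cur, guidance_ctx, scale, smooth_factor);
    return llama_sample_token_greedy(ctx, &cur);
}
```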
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.2'
+  VERSION = '0.3.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-32c5411'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -108,6 +109,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void