llama_cpp 0.3.2 → 0.3.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +165 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +217 -76
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +16 -5
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1082 -774
- data/ext/llama_cpp/src/ggml.h +64 -18
- data/ext/llama_cpp/src/llama.cpp +179 -51
- data/ext/llama_cpp/src/llama.h +15 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +3 -1
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -65,7 +65,7 @@
 // ggml_set_f32(a, 3.0f);
 // ggml_set_f32(b, 4.0f);
 //
-//
+// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 // printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //
@@ -132,10 +132,10 @@
 // {
 // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 //
-// // a[
+// // a[2, 1] = 1.0f;
 // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 //
-// // a[
+// // a[0, 2] = 2.0f;
 // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 //
 // ...
@@ -197,12 +197,17 @@
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_OPT 4
+#define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
 #define GGML_UNUSED(x) (void)(x)
 
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -363,6 +368,8 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -414,12 +421,7 @@ extern "C" {
         bool is_param;
 
         struct ggml_tensor * grad;
-        struct ggml_tensor * src0;
-        struct ggml_tensor * src1;
-        struct ggml_tensor * opt[GGML_MAX_OPT];
-
-        // thread scheduling
-        int n_tasks;
+        struct ggml_tensor * src[GGML_MAX_SRC];
 
         // performance
         int perf_runs;
@@ -432,19 +434,31 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t work_size;    // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
@@ -1161,6 +1175,31 @@ extern "C" {
             int s,
             int d);
 
+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0, // kernel size
+            int s0, // stride
+            int p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0,
+            int k1,
+            int s0,
+            int s1,
+            int p0,
+            int p1);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
@@ -1290,15 +1329,22 @@ extern "C" {
 
     GGML_API void ggml_set_param(
             struct ggml_context * ctx,
-            struct ggml_tensor
+            struct ggml_tensor * tensor);
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-
-
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
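The hunks above replace the old in-graph threading fields (n_threads, work_size, work) with a separate ggml_cplan that the caller prepares via ggml_graph_plan() and hands to ggml_graph_compute(). Below is a minimal sketch of that workflow against this header, using a tiny f = a*b graph; the variable names and memory size are illustrative, not taken from the release:

#include <cstdio>
#include <vector>
#include "ggml.h"

// Sketch: compute f = a*b with the plan/compute API introduced in this release.
int main() {
    struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * f = ggml_mul(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(f);

    ggml_set_f32(a, 3.0f);
    ggml_set_f32(b, 4.0f);

    // 1) plan the graph for the desired number of threads
    struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads*/ 4);

    // 2) the caller owns the work buffer; allocate it only when the plan asks for one
    std::vector<uint8_t> work;
    if (plan.work_size > 0) {
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }

    // 3) run the computation (returns GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED)
    ggml_graph_compute(&gf, &plan);

    printf("f = %f\n", ggml_get_f32_1d(f, 0));
    ggml_free(ctx);
    return 0;
}

When the context itself has room for the work data, ggml_graph_compute_with_ctx(ctx, &gf, n_threads) keeps the old single-call convenience, as the header comment notes.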
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,9 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_MPI
+#include "ggml-mpi.h"
+#endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
 #ifdef GGML_QKK_64
@@ -79,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -321,6 +343,9 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -330,6 +355,10 @@ struct llama_context {
     ggml_metal_context * ctx_metal = NULL;
 #endif
 
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
@@ -758,7 +787,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -849,7 +877,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend(bool numa) {
+void llama_backend_init(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -862,6 +890,16 @@ void llama_init_backend(bool numa) {
     if (numa) {
         ggml_numa_init();
     }
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_init();
+#endif
+}
+
+void llama_backend_free() {
+#ifdef GGML_USE_MPI
+    ggml_mpi_backend_free();
+#endif
 }
 
 int64_t llama_time_us() {
@@ -1263,18 +1301,16 @@ static bool llama_eval_internal(
         llama_context & lctx,
         const llama_token * tokens,
         const float * embd,
-
-
-
+        int n_tokens,
+        int n_past,
+        int n_threads,
         const char * cgraph_fname) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-
-
-
-        return false;
-    }
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -1306,20 +1342,26 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
     if (tokens) {
-        struct ggml_tensor *
-
-
-
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
     }
@@ -1337,18 +1379,20 @@ static bool llama_eval_internal(
     offload_func_t offload_func_v = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
-
-
-
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
 #endif // GGML_USE_CUBLAS
 
     for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
         offload_func_t offload_func = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
@@ -1555,7 +1599,6 @@ static bool llama_eval_internal(
 
         // input for next layer
         inpL = cur;
-
     }
 
     lctx.use_buf(ctx0, 0);
@@ -1563,7 +1606,6 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
@@ -1578,7 +1620,6 @@ static bool llama_eval_internal(
         embeddings = cur;
     }
 
-
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
@@ -1591,8 +1632,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
@@ -1612,12 +1658,21 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif
 
+#if GGML_USE_MPI
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+#endif
+
+    // update kv token count
+    lctx.kv_self.n = n_past + N;
+
+    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
     }
@@ -1633,23 +1688,17 @@ static bool llama_eval_internal(
     // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
     //}
 
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
     // extract logits
     {
         auto & logits_out = lctx.logits;
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -2118,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }
 
+static void llama_log_softmax(float * array, size_t size) {
+    float max_l = *std::max_element(array, array + size);
+    float sum = 0.f;
+    for (size_t i = 0; i < size; ++i) {
+        float p = expf(array[i] - max_l);
+        sum += p;
+        array[i] = p;
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        array[i] = logf(array[i] / sum);
+    }
+}
+
+void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale,
+        float smooth_factor) {
+    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+    assert(ctx);
+    auto n_vocab = llama_n_vocab(ctx);
+    assert(n_vocab == (int)candidates->size);
+    assert(!candidates->sorted);
+
+    std::vector<float> logits_base;
+    logits_base.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        logits_base.push_back(candidates->data[i].logit);
+    }
+    llama_log_softmax(logits_base.data(), candidates->size);
+
+    float* logits_guidance = llama_get_logits(guidance_ctx);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_guidance = logits_guidance[i];
+        float logit_base = logits_base[i];
+        logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+    }
+
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        float logit_base = logits_base[i];
+        float logit_guidance = logits_guidance[i];
+
+        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     assert(ctx);
@@ -2405,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            bool convert_incompatible_tensor = false;
             if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\
-
-                    fprintf(stderr, "========================================================================================\n\n");
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
                 }
             }
             if (tensor.name == "output.weight") {
@@ -2441,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            if (convert_incompatible_tensor) {
+                if (tensor.name == "output.weight") {
+                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+                    fprintf(stderr, "F16 will be used for this tensor instead.\n");
+                } else if (tensor.name == "tok_embeddings.weight") {
+                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+                    fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+                } else {
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
 #endif
 
             float * f32_data;
@@ -2575,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
 }
 
 struct llama_context * llama_new_context_with_model(
-
-
+        struct llama_model * model,
+        struct llama_context_params params) {
 
     if (!model) {
         return nullptr;
@@ -2645,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr = NULL;
         size_t data_size = 0;
@@ -2680,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
+#ifdef GGML_USE_MPI
+    ctx->ctx_mpi = ggml_mpi_init();
+
+    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+        const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        llama_backend_free();
+        exit(1);
+    }
+#endif
+
     return ctx;
 }
 
@@ -2802,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2966,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-
-
+
+        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -3120,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3140,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -3226,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
        ggml_cgraph gf{};
-       gf.n_threads = 1;
 
        ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
        kin3d->data = (void *) inp;
@@ -3246,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-
+       ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
        ggml_free(cpy_ctx);
    }
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -158,7 +158,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend(bool numa);
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();
 
     LLAMA_API int64_t llama_time_us();
 
@@ -307,6 +309,18 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            struct llama_context * guidance_ctx,
+            float scale,
+            float smooth_factor);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
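The header changes above rename llama_init_backend to llama_backend_init and add llama_backend_free as its counterpart. A rough lifecycle sketch against this header version follows; the model path is a placeholder, and llama_context_default_params / llama_load_model_from_file are assumed to be present in the same llama.h (they are not part of this diff):

#include "llama.h"

int main() {
    // Call once at program start (pass true to enable NUMA optimizations).
    llama_backend_init(/*numa*/ false);

    llama_context_params params = llama_context_default_params();

    // Placeholder model path for illustration only.
    llama_model   * model = llama_load_model_from_file("./models/7B/ggml-model.bin", params);
    llama_context * ctx   = llama_new_context_with_model(model, params);

    // ... tokenize, llama_eval(), sampling, etc. ...

    llama_free(ctx);
    llama_free_model(model);

    // Call once at program end - under GGML_USE_MPI this also shuts down the MPI backend.
    llama_backend_free();
    return 0;
}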
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.2'
+  VERSION = '0.3.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-32c5411'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -108,6 +109,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void