llama_cpp 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +11 -2
- data/ext/llama_cpp/llama_cpp.cpp +284 -111
- data/ext/llama_cpp/src/ggml-cuda.cu +639 -148
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +19 -6
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +1734 -2248
- data/ext/llama_cpp/src/ggml.h +152 -80
- data/ext/llama_cpp/src/llama.cpp +282 -90
- data/ext/llama_cpp/src/llama.h +30 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -13
- data/sig/llama_cpp.rbs +22 -2
- metadata +5 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -65,7 +65,7 @@
 //   ggml_set_f32(a, 3.0f);
 //   ggml_set_f32(b, 4.0f);
 //
-//   ggml_graph_compute(ctx, &gf);
+//   ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 //   printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //
@@ -132,10 +132,10 @@
 //   {
 //       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 //
-//       // a[1, 2] = 1.0f;
+//       // a[2, 1] = 1.0f;
 //       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 //
-//       // a[2, 0] = 2.0f;
+//       // a[0, 2] = 2.0f;
 //       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 //
 //   ...
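The corrected comments above follow the header's byte-stride convention: `nb[0]` is the stride between consecutive elements within a row and `nb[1]` is the stride between rows. A minimal sketch of that access pattern as a helper (the function name is illustrative, not part of the API):

```c
#include "ggml.h"

// Read element a[i1, i0] (row i1, column i0) of a 2-D f32 tensor using the
// same stride arithmetic as the header example above.
static float get_f32_2d(const struct ggml_tensor * a, int i1, int i0) {
    return *(const float *) ((const char *) a->data + i1*a->nb[1] + i0*a->nb[0]);
}
```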
@@ -197,10 +197,17 @@
 #define GGML_MAX_NODES         4096
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
-#define GGML_MAX_OPT           4
+#define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4
 
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
+#define GGML_UNUSED(x) (void)(x)
+
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -209,6 +216,30 @@
         } \
     } while (0)
 
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
 #ifdef __cplusplus
 extern "C" {
 #endif
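For context, a minimal sketch of how the new `GGML_TENSOR_LOCALS` macro is meant to be used inside an op implementation; the function name and prefixes here are illustrative only:

```c
#include <stdint.h>
#include "ggml.h"

// GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) expands (via GGML_TENSOR_LOCALS_1..3)
// into const locals ne00..ne03 copied from src0->ne[0..3], each wrapped in
// GGML_UNUSED() so that unused dimensions do not trigger warnings.
static void example_op(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); // ne00..ne03
    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); // nb00..nb03
    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); // ne10..ne13
    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb); // nb10..nb13

    // a real kernel would iterate over ne01/ne11 rows using the nb01/nb11 strides
}
```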
@@ -224,8 +255,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
     struct ggml_object;
     struct ggml_context;
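A minimal sketch of the row converters with their new `int` count parameter (values are arbitrary):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    float       src[4] = {0.5f, -1.25f, 3.0f, 42.0f};
    ggml_fp16_t half[4];
    float       back[4];

    // fp32 -> fp16 -> fp32 round trip over a small row
    ggml_fp32_to_fp16_row(src, half, 4);
    ggml_fp16_to_fp32_row(half, back, 4);

    for (int i = 0; i < 4; ++i) {
        printf("%g -> %g\n", src[i], back[i]);
    }
    return 0;
}
```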
@@ -295,12 +326,15 @@ extern "C" {
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
         GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_GELU_QUICK,
@@ -332,9 +366,10 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -386,12 +421,7 @@ extern "C" {
         bool is_param;
 
         struct ggml_tensor * grad;
-        struct ggml_tensor * src0;
-        struct ggml_tensor * src1;
-        struct ggml_tensor * opt[GGML_MAX_OPT];
-
-        // thread scheduling
-        int n_tasks;
+        struct ggml_tensor * src[GGML_MAX_SRC];
 
         // performance
         int perf_runs;
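With `src0`/`src1`/`opt[]` folded into a single `src[]` array, code that walks a node's inputs becomes one loop. A small sketch (the helper name is illustrative):

```c
#include <stddef.h>
#include "ggml.h"

// Count how many source tensors an op node actually references.
static int count_srcs(const struct ggml_tensor * node) {
    int n = 0;
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (node->src[i] != NULL) {
            n++;
        }
    }
    return n;
}
```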
@@ -404,19 +434,31 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
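A minimal sketch of the compute flow built on the new `ggml_cplan` (error handling omitted; the helper name and the cancellation flag are illustrative, not part of the API):

```c
#include <stdbool.h>
#include <stdlib.h>
#include "ggml.h"

// Hypothetical cancellation flag polled between ops via the plan's abort callback.
static bool abort_when_flagged(void * data) {
    return *(bool *) data;
}

// Run a graph that was built earlier (e.g. via ggml_build_forward_expand).
static int run_graph(struct ggml_cgraph * gf, int n_threads, bool * stop_flag) {
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

    // the caller now owns the work buffer; the graph no longer carries one
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size);
    }

    plan.abort_callback      = abort_when_flagged;
    plan.abort_callback_data = stop_flag;

    int rc = ggml_graph_compute(gf, &plan); // GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED

    free(plan.work_data);
    return rc;
}
```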
@@ -444,6 +486,9 @@ extern "C" {
 
 
     // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
         GGML_TASK_INIT = 0,
         GGML_TASK_COMPUTE,
@@ -687,6 +732,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
@@ -731,6 +781,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
@@ -1081,58 +1147,58 @@ extern "C" {
             float                 min,
             float                 max);
 
-
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0
-    //        int                   p0,
-    //        int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0,
-    //        int                   s1,
-    //        int                   p0,
-    //        int                   p1,
-    //        int                   d0,
-    //        int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a:      3    80  768    1
-    // b:   3000    80    1    1
-    // res: 3000   768    1    1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
 
-
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);
 
-    //
-    //
-
-    // example:
-    // a:   16   16    3  768
-    // b: 1024 1024    3    1
-    // res:  64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0, // kernel size
+            int                   s0, // stride
+            int                   p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1);
 
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
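A small usage sketch of the generalized convolution and pooling API; the shapes implied in the comments are assumptions for illustration, not requirements stated in the header:

```c
#include "ggml.h"

// Convolve a signal with a kernel, then max-pool the result. Following upstream
// ggml usage, `a` is the kernel and `b` is the data; stride, padding and
// dilation are now explicit arguments instead of being baked into the op name.
static struct ggml_tensor * conv_block(struct ggml_context * ctx,
                                       struct ggml_tensor  * kernel,
                                       struct ggml_tensor  * signal) {
    struct ggml_tensor * cur = ggml_conv_1d(ctx, kernel, signal,
                                            /*s0 =*/ 1, /*p0 =*/ 1, /*d0 =*/ 1);
    cur = ggml_pool_1d(ctx, cur, GGML_OP_POOL_MAX, /*k0 =*/ 2, /*s0 =*/ 2, /*p0 =*/ 0);
    return cur;
}
```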
@@ -1263,15 +1329,22 @@ extern "C" {
 
     GGML_API void ggml_set_param(
             struct ggml_context * ctx,
-            struct ggml_tensor * tensor);
+            struct ggml_tensor  * tensor);
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
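For callers that previously set `gf.n_threads` and used the old two-argument `ggml_graph_compute(ctx, &gf)`, the quickest replacement is the context-backed variant sketched below; the explicit `ggml_graph_plan()` / `ggml_graph_compute()` path is sketched after the `ggml_cplan` hunk above. The function name here is illustrative:

```c
#include "ggml.h"

// Quick path: let the context own the work buffer. The context must have been
// created with enough spare memory to hold the work data in addition to the tensors.
static void eval_graph(struct ggml_context * ctx, struct ggml_cgraph * gf) {
    ggml_graph_compute_with_ctx(ctx, gf, GGML_DEFAULT_N_THREADS);
}
```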
@@ -1488,25 +1561,24 @@ extern "C" {
     //
 
 #ifdef __cplusplus
-
+// restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
-        dequantize_row_q_t dequantize_row_q;
-        quantize_row_q_t   quantize_row_q;
-        quantize_row_q_t   quantize_row_q_reference;
-        quantize_row_q_t   quantize_row_q_dot;
-        vec_dot_q_t        vec_dot_q;
-        enum ggml_type     vec_dot_type;
-    } quantize_fns_t;
-
-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;
+
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
 #ifdef __cplusplus
 }