llama_cpp 0.3.1 → 0.3.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +11 -2
- data/ext/llama_cpp/llama_cpp.cpp +284 -111
- data/ext/llama_cpp/src/ggml-cuda.cu +639 -148
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +19 -6
- data/ext/llama_cpp/src/ggml-metal.metal +56 -47
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +1734 -2248
- data/ext/llama_cpp/src/ggml.h +152 -80
- data/ext/llama_cpp/src/llama.cpp +282 -90
- data/ext/llama_cpp/src/llama.h +30 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -13
- data/sig/llama_cpp.rbs +22 -2
- metadata +5 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -65,7 +65,7 @@
 //   ggml_set_f32(a, 3.0f);
 //   ggml_set_f32(b, 4.0f);
 //
-//   ggml_graph_compute(ctx, &gf);
+//   ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 //   printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //
@@ -132,10 +132,10 @@
 //  {
 //      struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 //
-//      // a[1, 2] = 1.0f;
+//      // a[2, 1] = 1.0f;
 //      *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 //
-//      // a[2, 0] = 2.0f;
+//      // a[0, 2] = 2.0f;
 //      *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 //
 //      ...
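The corrected indices follow the comment's own stride formula: element (i0, i1) of a 2-D tensor sits at byte offset i0*nb[0] + i1*nb[1], and the comment writes that element as a[i1, i0]. For the contiguous 2x3 F32 tensor in the example, nb[0] = 4 and nb[1] = 8, so 2*nb[1] + 1*nb[0] = 20 bytes is a[2, 1]. A small accessor sketch of the same arithmetic (the helper name is illustrative, not part of the header):

    #include "ggml.h"

    // Read element (i0, i1) of a 2-D F32 tensor through its byte strides,
    // exactly as the header comment does by hand.
    static float get_f32_2d(const struct ggml_tensor * a, int i0, int i1) {
        return *(const float *) ((const char *) a->data + i1*a->nb[1] + i0*a->nb[0]);
    }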
@@ -197,10 +197,17 @@
 #define GGML_MAX_NODES         4096
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
-#define GGML_MAX_OPT           4
+#define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4
 
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
+#define GGML_UNUSED(x) (void)(x)
+
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -209,6 +216,30 @@
         } \
     } while (0)
 
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3    (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
 #ifdef __cplusplus
 extern "C" {
 #endif
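Expanded, GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) declares one const local per dimension (ne10 through ne13), each silenced with GGML_UNUSED. A minimal usage sketch (the helper function is illustrative, not part of the header):

    #include <stdio.h>
    #include "ggml.h"

    // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) expands to:
    //   const int64_t ne10 = (src1)->ne[0]; GGML_UNUSED(ne10);
    //   ... and likewise ne11, ne12, ne13 for dimensions 1..3.
    static void print_shape(const struct ggml_tensor * src1) {
        GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
        printf("%lld x %lld x %lld x %lld\n",
               (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13);
    }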
@@ -224,8 +255,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
     struct ggml_object;
     struct ggml_context;
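The row converters now take a plain int count. A minimal round-trip sketch (the buffer contents are arbitrary illustration):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        float       src[4] = {1.0f, 2.5f, -3.0f, 0.125f};
        ggml_fp16_t half[4];
        float       dst[4];

        ggml_fp32_to_fp16_row(src, half, 4); // count parameter is now int
        ggml_fp16_to_fp32_row(half, dst, 4);

        for (int i = 0; i < 4; ++i) {
            printf("%f -> %f\n", src[i], dst[i]); // expect small fp16 rounding
        }
        return 0;
    }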
@@ -295,12 +326,15 @@ extern "C" {
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
         GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_GELU_QUICK,
@@ -332,9 +366,10 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -386,12 +421,7 @@ extern "C" {
         bool is_param;
 
         struct ggml_tensor * grad;
-        struct ggml_tensor * src0;
-        struct ggml_tensor * src1;
-        struct ggml_tensor * opt[GGML_MAX_OPT];
-
-        // thread scheduling
-        int n_tasks;
+        struct ggml_tensor * src[GGML_MAX_SRC];
 
         // performance
         int perf_runs;
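Folding src0/src1/opt into one src[GGML_MAX_SRC] array means a node's inputs can be walked with a single loop. A sketch (the early break assumes sources are packed from index 0, which matches how ggml fills the array):

    #include <stdio.h>
    #include "ggml.h"

    // Print the names of a node's input tensors.
    static void print_srcs(const struct ggml_tensor * node) {
        for (int i = 0; i < GGML_MAX_SRC; ++i) {
            if (node->src[i] == NULL) {
                break; // assumption: no gaps in src[]
            }
            printf("src[%d] = %s\n", i, node->src[i]->name);
        }
    }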
@@ -404,19 +434,31 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
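The plan also carries the abort hook that GGML_EXIT_ABORTED pairs with. A sketch of wiring one up (the deadline struct and names are illustrative, not part of the library):

    #include <stdbool.h>
    #include <time.h>
    #include "ggml.h"

    struct deadline { time_t t_end; }; // illustrative carrier for callback data

    // Returning true asks ggml_graph_compute() to stop.
    static bool abort_after_deadline(void * data) {
        const struct deadline * d = (const struct deadline *) data;
        return time(NULL) >= d->t_end;
    }

    // usage, given a plan from ggml_graph_plan():
    //   struct deadline d = { time(NULL) + 5 };
    //   plan.abort_callback      = abort_after_deadline;
    //   plan.abort_callback_data = &d;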
@@ -444,6 +486,9 @@ extern "C" {
 
 
     // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
         GGML_TASK_INIT = 0,
         GGML_TASK_COMPUTE,
@@ -687,6 +732,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
@@ -731,6 +781,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
@@ -1081,58 +1147,58 @@ extern "C" {
             float                 min,
             float                 max);
 
-
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0
-    //        int                   p0,
-    //        int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0,
-    //        int                   s1,
-    //        int                   p0,
-    //        int                   p1,
-    //        int                   d0,
-    //        int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a:      3    80  768    1
-    // b:   3000    80    1    1
-    // res: 3000   768    1    1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
 
-
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);
 
-    //
-    //
-
-    // example:
-    // a:     16    16    3  768
-    // b:   1024  1024    3    1
-    // res:   64    64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0, // kernel size
+            int                   s0, // stride
+            int                   p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1);
 
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
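The hard-coded whisper/sam variants give way to explicit stride/padding/dilation parameters. A sketch combining the new calls, assuming the usual 1-D convolution output-length formula out = (in + 2*p0 - d0*(k - 1) - 1)/s0 + 1; for in = 3000, k = 3, s0 = 1, p0 = 1, d0 = 1 the length stays 3000, matching the removed "padding = half" behavior:

    #include "ggml.h"

    // Convolve, then halve the length with a max pool; names are illustrative.
    static struct ggml_tensor * conv_block(
            struct ggml_context * ctx,
            struct ggml_tensor  * kernel, // the `a` argument
            struct ggml_tensor  * input)  // the `b` argument
    {
        // stride 1, half padding, no dilation:
        // equivalent to ggml_conv_1d_ph(ctx, kernel, input, 1, 1)
        struct ggml_tensor * conv =
            ggml_conv_1d(ctx, kernel, input, 1, (int)(kernel->ne[0]/2), 1);

        // pool kernel 2, stride 2, no padding
        return ggml_pool_1d(ctx, conv, GGML_OP_POOL_MAX, 2, 2, 0);
    }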
@@ -1263,15 +1329,22 @@ extern "C" {
 
     GGML_API void ggml_set_param(
             struct ggml_context * ctx,
-            struct ggml_tensor  * tensor);
+            struct ggml_tensor * tensor);
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
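Taken together with struct ggml_cplan above, the new flow is: plan, allocate the work buffer if one is requested, compute. A minimal sketch (graph construction and error handling elided):

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    static int compute_graph(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (plan.work_size > 0) {
            work = malloc(plan.work_size); // caller owns plan.work_data
            plan.work_data = work;
        }

        const int rc = ggml_graph_compute(gf, &plan); // GGML_EXIT_SUCCESS / GGML_EXIT_ABORTED

        free(work);
        return rc;
    }

When the context has spare memory, ggml_graph_compute_with_ctx(ctx, gf, n_threads) wraps these steps in one call, per the note in the hunk above.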
@@ -1488,25 +1561,24 @@ extern "C" {
     //
 
 #ifdef  __cplusplus
-
+// restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
-        dequantize_row_q_t dequantize_row_q;
-        quantize_row_q_t   quantize_row_q;
-        quantize_row_q_t   quantize_row_q_reference;
-        quantize_row_q_t   quantize_row_q_dot;
-        vec_dot_q_t        vec_dot_q;
-        enum ggml_type     vec_dot_type;
-    } quantize_fns_t;
-
-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;
+
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
 #ifdef  __cplusplus
 }
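The renamed traits generalize the old quantize-function table to every tensor type. A sketch that round-trips one row through a type's converters, assuming the chosen type provides both, and sizing the buffer with the existing ggml_blck_size()/ggml_type_size() helpers; k must be a multiple of the block size:

    #include <stdlib.h>
    #include "ggml.h"

    static void roundtrip_row(enum ggml_type type, const float * src, float * dst, int k) {
        const ggml_type_traits_t traits = ggml_internal_get_type_traits(type);

        // bytes for k elements: (k / block size) blocks of type_size bytes each
        const size_t row_bytes = (size_t) (k / ggml_blck_size(type)) * ggml_type_size(type);
        void * quant = malloc(row_bytes);

        traits.from_float(src, quant, k); // quantize
        traits.to_float (quant, dst, k);  // dequantize back

        free(quant);
    }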