llama_cpp 0.3.2 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -65,7 +65,7 @@
|
|
65
65
|
// ggml_set_f32(a, 3.0f);
|
66
66
|
// ggml_set_f32(b, 4.0f);
|
67
67
|
//
|
68
|
-
// ggml_graph_compute(ctx0, &gf);
|
68
|
+
// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
|
69
69
|
//
|
70
70
|
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
71
71
|
//
|
@@ -132,10 +132,10 @@
|
|
132
132
|
// {
|
133
133
|
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
|
134
134
|
//
|
135
|
-
// // a[1, 2] = 1.0f;
|
135
|
+
// // a[2, 1] = 1.0f;
|
136
136
|
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
137
137
|
//
|
138
|
-
// // a[2, 0] = 2.0f;
|
138
|
+
// // a[0, 2] = 2.0f;
|
139
139
|
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
|
140
140
|
//
|
141
141
|
// ...
|
@@ -197,12 +197,17 @@
|
|
197
197
|
#define GGML_MAX_NODES 4096
|
198
198
|
#define GGML_MAX_PARAMS 256
|
199
199
|
#define GGML_MAX_CONTEXTS 64
|
200
|
-
#define GGML_MAX_OPT 4
|
200
|
+
#define GGML_MAX_SRC 6
|
201
201
|
#define GGML_MAX_NAME 48
|
202
202
|
#define GGML_DEFAULT_N_THREADS 4
|
203
203
|
|
204
|
+
|
205
|
+
#define GGML_EXIT_SUCCESS 0
|
206
|
+
#define GGML_EXIT_ABORTED 1
|
207
|
+
|
204
208
|
#define GGML_UNUSED(x) (void)(x)
|
205
209
|
|
210
|
+
|
206
211
|
#define GGML_ASSERT(x) \
|
207
212
|
do { \
|
208
213
|
if (!(x)) { \
|
@@ -363,6 +368,8 @@ extern "C" {
|
|
363
368
|
GGML_OP_CLAMP,
|
364
369
|
GGML_OP_CONV_1D,
|
365
370
|
GGML_OP_CONV_2D,
|
371
|
+
GGML_OP_POOL_1D,
|
372
|
+
GGML_OP_POOL_2D,
|
366
373
|
|
367
374
|
GGML_OP_FLASH_ATTN,
|
368
375
|
GGML_OP_FLASH_FF,
|
@@ -414,12 +421,7 @@ extern "C" {
|
|
414
421
|
bool is_param;
|
415
422
|
|
416
423
|
struct ggml_tensor * grad;
|
417
|
-
struct ggml_tensor * src0;
|
418
|
-
struct ggml_tensor * src1;
|
419
|
-
struct ggml_tensor * opt[GGML_MAX_OPT];
|
420
|
-
|
421
|
-
// thread scheduling
|
422
|
-
int n_tasks;
|
424
|
+
struct ggml_tensor * src[GGML_MAX_SRC];
|
423
425
|
|
424
426
|
// performance
|
425
427
|
int perf_runs;
|
@@ -432,19 +434,31 @@ extern "C" {
|
|
432
434
|
|
433
435
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
434
436
|
|
435
|
-
char padding[
|
437
|
+
char padding[8];
|
436
438
|
};
|
437
439
|
|
438
440
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
439
441
|
|
442
|
+
// the compute plan that needs to be prepared for ggml_graph_compute()
|
443
|
+
// since https://github.com/ggerganov/ggml/issues/287
|
444
|
+
struct ggml_cplan {
|
445
|
+
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
446
|
+
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
447
|
+
|
448
|
+
int n_threads;
|
449
|
+
|
450
|
+
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
451
|
+
int n_tasks[GGML_MAX_NODES];
|
452
|
+
|
453
|
+
// abort ggml_graph_compute when true
|
454
|
+
bool (*abort_callback)(void * data);
|
455
|
+
void * abort_callback_data;
|
456
|
+
};
|
457
|
+
|
440
458
|
// computation graph
|
441
459
|
struct ggml_cgraph {
|
442
460
|
int n_nodes;
|
443
461
|
int n_leafs;
|
444
|
-
int n_threads;
|
445
|
-
|
446
|
-
size_t work_size;
|
447
|
-
struct ggml_tensor * work;
|
448
462
|
|
449
463
|
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
450
464
|
struct ggml_tensor * grads[GGML_MAX_NODES];
|
@@ -1107,6 +1121,17 @@ extern "C" {
|
|
1107
1121
|
int mode,
|
1108
1122
|
int n_ctx);
|
1109
1123
|
|
1124
|
+
// custom RoPE, in-place, returns view(a)
|
1125
|
+
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1126
|
+
struct ggml_context * ctx,
|
1127
|
+
struct ggml_tensor * a,
|
1128
|
+
int n_past,
|
1129
|
+
int n_dims,
|
1130
|
+
int mode,
|
1131
|
+
int n_ctx,
|
1132
|
+
float freq_base,
|
1133
|
+
float freq_scale);
|
1134
|
+
|
1110
1135
|
// rotary position embedding backward, i.e compute dx from dy
|
1111
1136
|
// a - dy
|
1112
1137
|
GGML_API struct ggml_tensor * ggml_rope_back(
|
@@ -1114,7 +1139,8 @@ extern "C" {
|
|
1114
1139
|
struct ggml_tensor * a,
|
1115
1140
|
int n_past,
|
1116
1141
|
int n_dims,
|
1117
|
-
int mode);
|
1142
|
+
int mode,
|
1143
|
+
int n_ctx);
|
1118
1144
|
|
1119
1145
|
// alibi position embedding
|
1120
1146
|
// in-place, returns view(a)
|
@@ -1161,6 +1187,31 @@ extern "C" {
|
|
1161
1187
|
int s,
|
1162
1188
|
int d);
|
1163
1189
|
|
1190
|
+
enum ggml_op_pool {
|
1191
|
+
GGML_OP_POOL_MAX,
|
1192
|
+
GGML_OP_POOL_AVG,
|
1193
|
+
GGML_OP_POOL_COUNT,
|
1194
|
+
};
|
1195
|
+
|
1196
|
+
GGML_API struct ggml_tensor* ggml_pool_1d(
|
1197
|
+
struct ggml_context * ctx,
|
1198
|
+
struct ggml_tensor * a,
|
1199
|
+
enum ggml_op_pool op,
|
1200
|
+
int k0, // kernel size
|
1201
|
+
int s0, // stride
|
1202
|
+
int p0); // padding
|
1203
|
+
|
1204
|
+
GGML_API struct ggml_tensor* ggml_pool_2d(
|
1205
|
+
struct ggml_context * ctx,
|
1206
|
+
struct ggml_tensor * a,
|
1207
|
+
enum ggml_op_pool op,
|
1208
|
+
int k0,
|
1209
|
+
int k1,
|
1210
|
+
int s0,
|
1211
|
+
int s1,
|
1212
|
+
int p0,
|
1213
|
+
int p1);
|
1214
|
+
|
1164
1215
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1165
1216
|
struct ggml_context * ctx,
|
1166
1217
|
struct ggml_tensor * q,
|
@@ -1290,15 +1341,22 @@ extern "C" {
|
|
1290
1341
|
|
1291
1342
|
GGML_API void ggml_set_param(
|
1292
1343
|
struct ggml_context * ctx,
|
1293
|
-
struct ggml_tensor * tensor);
|
1344
|
+
struct ggml_tensor * tensor);
|
1294
1345
|
|
1295
1346
|
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1296
1347
|
|
1297
1348
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1298
1349
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
1299
1350
|
|
1300
|
-
|
1301
|
-
|
1351
|
+
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1352
|
+
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1353
|
+
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
1354
|
+
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1355
|
+
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
1356
|
+
|
1357
|
+
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
1358
|
+
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
1359
|
+
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
1302
1360
|
|
1303
1361
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1304
1362
|
|
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -15,6 +15,14 @@
|
|
15
15
|
#define K_SCALE_SIZE 12
|
16
16
|
#endif
|
17
17
|
|
18
|
+
#ifndef static_assert
|
19
|
+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
20
|
+
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
21
|
+
#else
|
22
|
+
#define static_assert(cond, msg) struct global_scope_noop_trick
|
23
|
+
#endif
|
24
|
+
#endif
|
25
|
+
|
18
26
|
//
|
19
27
|
// Super-block quantization structures
|
20
28
|
//
|