llama_cpp 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -65,7 +65,7 @@
|
|
65
65
|
// ggml_set_f32(a, 3.0f);
|
66
66
|
// ggml_set_f32(b, 4.0f);
|
67
67
|
//
|
68
|
-
// ggml_graph_compute(ctx0, &gf);
|
68
|
+
// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
|
69
69
|
//
|
70
70
|
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
71
71
|
//
|
@@ -132,10 +132,10 @@
|
|
132
132
|
// {
|
133
133
|
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
|
134
134
|
//
|
135
|
-
// // a[1, 2] = 1.0f;
|
135
|
+
// // a[2, 1] = 1.0f;
|
136
136
|
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
137
137
|
//
|
138
|
-
// // a[2, 0] = 2.0f;
|
138
|
+
// // a[0, 2] = 2.0f;
|
139
139
|
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
|
140
140
|
//
|
141
141
|
// ...
|
@@ -197,12 +197,17 @@
|
|
197
197
|
#define GGML_MAX_NODES 4096
|
198
198
|
#define GGML_MAX_PARAMS 256
|
199
199
|
#define GGML_MAX_CONTEXTS 64
|
200
|
-
#define GGML_MAX_OPT 4
|
200
|
+
#define GGML_MAX_SRC 6
|
201
201
|
#define GGML_MAX_NAME 48
|
202
202
|
#define GGML_DEFAULT_N_THREADS 4
|
203
203
|
|
204
|
+
|
205
|
+
#define GGML_EXIT_SUCCESS 0
|
206
|
+
#define GGML_EXIT_ABORTED 1
|
207
|
+
|
204
208
|
#define GGML_UNUSED(x) (void)(x)
|
205
209
|
|
210
|
+
|
206
211
|
#define GGML_ASSERT(x) \
|
207
212
|
do { \
|
208
213
|
if (!(x)) { \
|
@@ -363,6 +368,8 @@ extern "C" {
|
|
363
368
|
GGML_OP_CLAMP,
|
364
369
|
GGML_OP_CONV_1D,
|
365
370
|
GGML_OP_CONV_2D,
|
371
|
+
GGML_OP_POOL_1D,
|
372
|
+
GGML_OP_POOL_2D,
|
366
373
|
|
367
374
|
GGML_OP_FLASH_ATTN,
|
368
375
|
GGML_OP_FLASH_FF,
|
@@ -414,12 +421,7 @@ extern "C" {
|
|
414
421
|
bool is_param;
|
415
422
|
|
416
423
|
struct ggml_tensor * grad;
|
417
|
-
struct ggml_tensor * src0;
|
418
|
-
struct ggml_tensor * src1;
|
419
|
-
struct ggml_tensor * opt[GGML_MAX_OPT];
|
420
|
-
|
421
|
-
// thread scheduling
|
422
|
-
int n_tasks;
|
424
|
+
struct ggml_tensor * src[GGML_MAX_SRC];
|
423
425
|
|
424
426
|
// performance
|
425
427
|
int perf_runs;
|
@@ -432,19 +434,31 @@ extern "C" {
|
|
432
434
|
|
433
435
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
434
436
|
|
435
|
-
char padding[
|
437
|
+
char padding[8];
|
436
438
|
};
|
437
439
|
|
438
440
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
439
441
|
|
442
|
+
// the compute plan that needs to be prepared for ggml_graph_compute()
|
443
|
+
// since https://github.com/ggerganov/ggml/issues/287
|
444
|
+
struct ggml_cplan {
|
445
|
+
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
446
|
+
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
447
|
+
|
448
|
+
int n_threads;
|
449
|
+
|
450
|
+
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
451
|
+
int n_tasks[GGML_MAX_NODES];
|
452
|
+
|
453
|
+
// abort ggml_graph_compute when true
|
454
|
+
bool (*abort_callback)(void * data);
|
455
|
+
void * abort_callback_data;
|
456
|
+
};
|
457
|
+
|
440
458
|
// computation graph
|
441
459
|
struct ggml_cgraph {
|
442
460
|
int n_nodes;
|
443
461
|
int n_leafs;
|
444
|
-
int n_threads;
|
445
|
-
|
446
|
-
size_t work_size;
|
447
|
-
struct ggml_tensor * work;
|
448
462
|
|
449
463
|
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
450
464
|
struct ggml_tensor * grads[GGML_MAX_NODES];
|
@@ -1107,6 +1121,17 @@ extern "C" {
|
|
1107
1121
|
int mode,
|
1108
1122
|
int n_ctx);
|
1109
1123
|
|
1124
|
+
// custom RoPE, in-place, returns view(a)
|
1125
|
+
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1126
|
+
struct ggml_context * ctx,
|
1127
|
+
struct ggml_tensor * a,
|
1128
|
+
int n_past,
|
1129
|
+
int n_dims,
|
1130
|
+
int mode,
|
1131
|
+
int n_ctx,
|
1132
|
+
float freq_base,
|
1133
|
+
float freq_scale);
|
1134
|
+
|
1110
1135
|
// rotary position embedding backward, i.e compute dx from dy
|
1111
1136
|
// a - dy
|
1112
1137
|
GGML_API struct ggml_tensor * ggml_rope_back(
|
@@ -1114,7 +1139,8 @@ extern "C" {
|
|
1114
1139
|
struct ggml_tensor * a,
|
1115
1140
|
int n_past,
|
1116
1141
|
int n_dims,
|
1117
|
-
int mode);
|
1142
|
+
int mode,
|
1143
|
+
int n_ctx);
|
1118
1144
|
|
1119
1145
|
// alibi position embedding
|
1120
1146
|
// in-place, returns view(a)
|
@@ -1161,6 +1187,31 @@ extern "C" {
|
|
1161
1187
|
int s,
|
1162
1188
|
int d);
|
1163
1189
|
|
1190
|
+
enum ggml_op_pool {
|
1191
|
+
GGML_OP_POOL_MAX,
|
1192
|
+
GGML_OP_POOL_AVG,
|
1193
|
+
GGML_OP_POOL_COUNT,
|
1194
|
+
};
|
1195
|
+
|
1196
|
+
GGML_API struct ggml_tensor* ggml_pool_1d(
|
1197
|
+
struct ggml_context * ctx,
|
1198
|
+
struct ggml_tensor * a,
|
1199
|
+
enum ggml_op_pool op,
|
1200
|
+
int k0, // kernel size
|
1201
|
+
int s0, // stride
|
1202
|
+
int p0); // padding
|
1203
|
+
|
1204
|
+
GGML_API struct ggml_tensor* ggml_pool_2d(
|
1205
|
+
struct ggml_context * ctx,
|
1206
|
+
struct ggml_tensor * a,
|
1207
|
+
enum ggml_op_pool op,
|
1208
|
+
int k0,
|
1209
|
+
int k1,
|
1210
|
+
int s0,
|
1211
|
+
int s1,
|
1212
|
+
int p0,
|
1213
|
+
int p1);
|
1214
|
+
|
1164
1215
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1165
1216
|
struct ggml_context * ctx,
|
1166
1217
|
struct ggml_tensor * q,
|
@@ -1290,15 +1341,22 @@ extern "C" {
|
|
1290
1341
|
|
1291
1342
|
GGML_API void ggml_set_param(
|
1292
1343
|
struct ggml_context * ctx,
|
1293
|
-
struct ggml_tensor
|
1344
|
+
struct ggml_tensor * tensor);
|
1294
1345
|
|
1295
1346
|
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1296
1347
|
|
1297
1348
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1298
1349
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
1299
1350
|
|
1300
|
-
|
1301
|
-
|
1351
|
+
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1352
|
+
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1353
|
+
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
1354
|
+
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1355
|
+
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
1356
|
+
|
1357
|
+
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
1358
|
+
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
1359
|
+
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
1302
1360
|
|
1303
1361
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1304
1362
|
|
data/ext/llama_cpp/src/k_quants.h
CHANGED
@@ -15,6 +15,14 @@
|
|
15
15
|
#define K_SCALE_SIZE 12
|
16
16
|
#endif
|
17
17
|
|
18
|
+
#ifndef static_assert
|
19
|
+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
20
|
+
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
21
|
+
#else
|
22
|
+
#define static_assert(cond, msg) struct global_scope_noop_trick
|
23
|
+
#endif
|
24
|
+
#endif
|
25
|
+
|
18
26
|
//
|
19
27
|
// Super-block quantization structures
|
20
28
|
//
|