llama_cpp 0.3.2 → 0.3.4

@@ -65,7 +65,7 @@
 //   ggml_set_f32(a, 3.0f);
 //   ggml_set_f32(b, 4.0f);
 //
-//   ggml_graph_compute(ctx0, &gf);
+//   ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 //   printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //
@@ -132,10 +132,10 @@
 //   {
 //       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 //
-//       // a[1, 2] = 1.0f;
+//       // a[2, 1] = 1.0f;
 //       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 //
-//       // a[2, 0] = 2.0f;
+//       // a[0, 2] = 2.0f;
 //       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 //
 //       ...
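The corrected comments reflect ggml's stride-based addressing: for a 2-D tensor, element a[i1, i0] lives at byte offset i1*nb[1] + i0*nb[0] from a->data, so "a[2, 1]" really is 2*nb[1] + 1*nb[0]. A minimal sketch of that addressing as a helper (set_f32_2d is a hypothetical name, not part of the header):

    // hypothetical helper: write a[i1, i0] of an F32 tensor via the byte strides;
    // nb[0] is the element stride (sizeof(float) for F32), nb[1] is the row stride
    static void set_f32_2d(struct ggml_tensor * a, int i1, int i0, float v) {
        *(float *) ((char *) a->data + i1*a->nb[1] + i0*a->nb[0]) = v;
    }

    // set_f32_2d(a, 2, 1, 1.0f);   // same effect as the "a[2, 1] = 1.0f" line above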
@@ -197,12 +197,17 @@
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_OPT 4
+#define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4

+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
 #define GGML_UNUSED(x) (void)(x)

+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -363,6 +368,8 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,

         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -414,12 +421,7 @@ extern "C" {
         bool is_param;

         struct ggml_tensor * grad;
-        struct ggml_tensor * src0;
-        struct ggml_tensor * src1;
-        struct ggml_tensor * opt[GGML_MAX_OPT];
-
-        // thread scheduling
-        int n_tasks;
+        struct ggml_tensor * src[GGML_MAX_SRC];

         // performance
         int perf_runs;
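Since src0, src1 and opt[] are folded into a single src[GGML_MAX_SRC] array (and the per-node n_tasks scheduling field moves into the new ggml_cplan below), callers that walked a node's parents presumably migrate along these lines (visit() is a placeholder for whatever per-parent work the caller does):

    // before this change: visit(t->src0); visit(t->src1); then a loop over t->opt[GGML_MAX_OPT]
    // after: one loop over the unified source array
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (t->src[i] != NULL) {
            visit(t->src[i]);
        }
    }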
@@ -432,19 +434,31 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[4];
+        char padding[8];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;

         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
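The plan now owns what used to live on the graph (thread count, work buffer) plus per-node task counts and an optional abort hook; when the callback returns true the computation is expected to stop and report GGML_EXIT_ABORTED. A sketch of wiring the hook (the flag and callback names are illustrative, not part of the header):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool g_stop = false;   // e.g. flipped from a signal handler or another thread

    static bool my_abort_cb(void * data) {
        (void) data;
        return atomic_load(&g_stop);     // true => stop the graph computation
    }

    // ...
    // struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);
    // plan.abort_callback      = my_abort_cb;
    // plan.abort_callback_data = NULL;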
@@ -1107,6 +1121,17 @@ extern "C" {
             int mode,
             int n_ctx);

+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode,
+            int n_ctx,
+            float freq_base,
+            float freq_scale);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
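The custom variant exposes the frequency base and scale that the plain RoPE call keeps fixed (10000.0f and 1.0f in ggml's default), which is the hook used for context-length scaling. A call sketch with illustrative values (tensor and variable names are not from the diff):

    // plain behaviour corresponds to freq_base = 10000.0f, freq_scale = 1.0f;
    // halving freq_scale is the usual "linear" scaling for a roughly doubled context
    cur = ggml_rope_custom_inplace(ctx, cur,
            n_past,      // tokens already processed
            n_rot,       // number of rotated dimensions
            0,           // mode
            n_ctx,       // context length
            10000.0f,    // freq_base
            0.5f);       // freq_scale (illustrative)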
@@ -1114,7 +1139,8 @@ extern "C" {
             struct ggml_tensor * a,
             int n_past,
             int n_dims,
-            int mode);
+            int mode,
+            int n_ctx);

     // alibi position embedding
     // in-place, returns view(a)
@@ -1161,6 +1187,31 @@ extern "C" {
             int s,
             int d);

+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0, // kernel size
+            int s0, // stride
+            int p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0,
+            int k1,
+            int s0,
+            int s1,
+            int p0,
+            int p1);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
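A usage sketch for the new pooling ops (tensor names and values are illustrative; op, kernel, stride and padding follow the parameter comments above):

    // 1-D average pooling: window 2, stride 2, no padding
    struct ggml_tensor * p1 = ggml_pool_1d(ctx, a, GGML_OP_POOL_AVG, 2, 2, 0);

    // 2-D max pooling: 2x2 window, stride 2 in both directions, no padding
    struct ggml_tensor * p2 = ggml_pool_2d(ctx, b, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);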
@@ -1290,15 +1341,22 @@ extern "C" {

     GGML_API void ggml_set_param(
             struct ggml_context * ctx,
-            struct ggml_tensor * tensor);
+            struct ggml_tensor * tensor);

     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

-    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

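Per the new comments, computing a graph is now a two-step affair: ask for a plan, hand it a work buffer if it needs one, then run. A minimal sketch of the flow (needs <stdlib.h>; error handling omitted; the convenience wrapper at the end trades that bookkeeping for context memory):

    struct ggml_cgraph gf = ggml_build_forward(f);

    // 1. size the scratch buffer for the requested number of threads
    struct ggml_cplan plan = ggml_graph_plan(&gf, 4 /* n_threads */);

    // 2. the caller owns the work buffer
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size);
    }

    // 3. run; returns GGML_EXIT_SUCCESS, or GGML_EXIT_ABORTED if the abort callback fired
    int rc = ggml_graph_compute(&gf, &plan);
    free(plan.work_data);

    // or let ggml carve the work buffer out of the context (which must have enough headroom):
    // ggml_graph_compute_with_ctx(ctx, &gf, 4 /* n_threads */);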
@@ -15,6 +15,14 @@
 #define K_SCALE_SIZE 12
 #endif

+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //
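The guard gives the quantization header a working static_assert even on compilers that predate C11 _Static_assert (in which case it degrades to a harmless no-op declaration and checks nothing). The kind of compile-time check it enables, shown as a generic illustration rather than a line from the header:

    // with C11 this fails the build if the assumption is wrong;
    // with the pre-C11 fallback it expands to "struct global_scope_noop_trick" and is inert
    static_assert(sizeof(float) == 4, "k-quant code assumes 32-bit IEEE float");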