llama_cpp 0.3.2 → 0.3.4

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -65,7 +65,7 @@
 //   ggml_set_f32(a, 3.0f);
 //   ggml_set_f32(b, 4.0f);
 //
-//   ggml_graph_compute(ctx0, &gf);
+//   ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 //   printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //

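The first hunk updates the header's introductory example to the new compute entry point. A minimal end-to-end sketch of the updated example follows; the 16 MB arena size is an assumption, and must be large enough for the tensors, the graph, and the work data that ggml_graph_compute_with_ctx() carves out of the context:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // 16 MB arena (assumed sufficient for tensors, graph, and work data)
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * f = ggml_mul(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(f);

    ggml_set_f32(a, 3.0f);
    ggml_set_f32(b, 4.0f);

    // the thread count is now passed explicitly instead of living in the graph
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads =*/ 4);

    printf("f = %f\n", ggml_get_f32_1d(f, 0)); // prints 12.0
    ggml_free(ctx);
    return 0;
}
```
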
@@ -132,10 +132,10 @@
 // {
 //     struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 //
-//     // a[1, 2] = 1.0f;
+//     // a[2, 1] = 1.0f;
 //     *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 //
-//     // a[2, 0] = 2.0f;
+//     // a[0, 2] = 2.0f;
 //     *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 //
 //   ...

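The corrected comments now match the stride arithmetic beneath them: in the header's a[i1, i0] notation, nb[1] is the byte stride between rows and nb[0] the stride between elements, so `2*a->nb[1] + 1*a->nb[0]` addresses a[2, 1], not a[1, 2]. A hypothetical helper (set_f32_2d is not part of the API) that makes the convention explicit:

```c
// hypothetical helper mirroring the header's addressing: element [i1, i0]
// lives at data + i1*nb[1] + i0*nb[0]
static void set_f32_2d(struct ggml_tensor * a, int i1, int i0, float v) {
    *(float *) ((char *) a->data + i1*a->nb[1] + i0*a->nb[0]) = v;
}

// equivalent to the "a[2, 1] = 1.0f" line in the example above:
// set_f32_2d(a, 2, 1, 1.0f);
```
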
@@ -197,12 +197,17 @@
 #define GGML_MAX_NODES         4096
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
-#define GGML_MAX_OPT           4
+#define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4
 
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
 #define GGML_UNUSED(x) (void)(x)
 
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \

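GGML_EXIT_SUCCESS and GGML_EXIT_ABORTED are the return values of the reworked ggml_graph_compute() (see the ggml_cplan changes below). A fragment sketching the check, assuming a graph `gf` and a plan `plan` are already set up:

```c
// rc is GGML_EXIT_ABORTED when the plan's abort_callback returned true
int rc = ggml_graph_compute(&gf, &plan);
if (rc == GGML_EXIT_ABORTED) {
    // computation stopped early; results are incomplete
}
```
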
@@ -363,6 +368,8 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,

@@ -414,12 +421,7 @@ extern "C" {
         bool is_param;
 
         struct ggml_tensor * grad;
-        struct ggml_tensor * src0;
-        struct ggml_tensor * src1;
-        struct ggml_tensor * opt[GGML_MAX_OPT];
-
-        // thread scheduling
-        int n_tasks;
+        struct ggml_tensor * src[GGML_MAX_SRC];
 
         // performance
         int perf_runs;

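This hunk merges the separate src0/src1/opt[] parent pointers into a single src[GGML_MAX_SRC] array (hence GGML_MAX_OPT giving way to GGML_MAX_SRC above), and the per-tensor n_tasks moves into the new ggml_cplan in the next hunk. Code that walked a tensor's parents becomes a plain loop; a fragment assuming a tensor `t` (needs <stdio.h>):

```c
// unused source slots are NULL, so one loop replaces the old
// src0/src1/opt[] special cases
for (int i = 0; i < GGML_MAX_SRC; ++i) {
    if (t->src[i] != NULL) {
        printf("parent %d: %s\n", i, t->src[i]->name);
    }
}
```
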
@@ -432,19 +434,31 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];

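The new ggml_cplan owns what used to be scheduling state on the graph: the thread count, the per-node task counts, and the work buffer. It also adds cooperative cancellation through abort_callback. A sketch of a deadline-based callback; the deadline struct and names are illustrative, not part of the API:

```c
struct deadline { int64_t t_end_us; };

// returning true makes ggml_graph_compute() stop and return GGML_EXIT_ABORTED
static bool abort_after_deadline(void * data) {
    const struct deadline * d = (const struct deadline *) data;
    return ggml_time_us() > d->t_end_us;
}

// wiring, before computing the graph:
//     plan.abort_callback      = abort_after_deadline;
//     plan.abort_callback_data = &my_deadline;
```
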
@@ -1107,6 +1121,17 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(

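ggml_rope_custom_inplace() exposes the RoPE frequency parameters that the plain ggml_rope() fixes at a base of 10000 with no scaling; lowering freq_scale is the common linear-scaling trick for pushing a model past its trained context length. An illustrative call, where cur, n_past, n_rot, and n_ctx are assumed to come from the surrounding inference code:

```c
// equivalent to plain ggml_rope() when freq_base = 10000.0f and
// freq_scale = 1.0f; 0.5f compresses positions to cover 2x the context
cur = ggml_rope_custom_inplace(ctx, cur, n_past, n_rot, /*mode =*/ 0, n_ctx,
                               /*freq_base  =*/ 10000.0f,
                               /*freq_scale =*/ 0.5f);
```
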
@@ -1114,7 +1139,8 @@ extern "C" {
             struct ggml_tensor  * a,
             int                   n_past,
             int                   n_dims,
-            int                   mode);
+            int                   mode,
+            int                   n_ctx);
 
     // alibi position embedding
     // in-place, returns view(a)

@@ -1161,6 +1187,31 @@ extern "C" {
             int                   s,
             int                   d);
 
+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0, // kernel size
+            int s0, // stride
+            int p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_op_pool op,
+            int k0,
+            int k1,
+            int s0,
+            int s1,
+            int p0,
+            int p1);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor * q,

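The new pooling operators take the usual kernel/stride/padding parameters per axis. For example, a 2x2 average pool with stride 2 and no padding, which roughly halves each spatial dimension of a float tensor `img` assumed to already live in `ctx`:

```c
// output size per axis is (ne - k) / s + 1, so 2x2 with stride 2 halves it
struct ggml_tensor * pooled = ggml_pool_2d(ctx, img, GGML_OP_POOL_AVG,
                                           /*k0 =*/ 2, /*k1 =*/ 2,
                                           /*s0 =*/ 2, /*s1 =*/ 2,
                                           /*p0 =*/ 0, /*p1 =*/ 0);
```
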
@@ -1290,15 +1341,22 @@ extern "C" {
 
     GGML_API void ggml_set_param(
             struct ggml_context * ctx,
-            struct ggml_tensor * tensor);
+            struct ggml_tensor  * tensor);
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 

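This is the breaking change behind the release: ggml_graph_compute() no longer receives a context and no longer allocates its own scratch memory; the caller plans first, then provides the work buffer. A migration sketch, assuming an already-built graph `gf`:

```c
#include <stdlib.h>

// before: gf.n_threads = n_threads; ggml_graph_compute(ctx, &gf);
// after:  plan first, then compute
struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);

uint8_t * work = NULL;
if (plan.work_size > 0) {
    work = malloc(plan.work_size); // the caller now owns the work buffer
    plan.work_data = work;
}

int rc = ggml_graph_compute(&gf, &plan);
free(work);

if (rc != GGML_EXIT_SUCCESS) {
    // GGML_EXIT_ABORTED: plan.abort_callback asked to stop
}
```

ggml_graph_compute_with_ctx() preserves the old single-call convenience by taking the work buffer from the context, provided the context was created with enough spare memory.
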
@@ -15,6 +15,14 @@
 #define K_SCALE_SIZE 12
 #endif
 
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //

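The last hunk, in the quantization header, adds a static_assert shim: C11 compilers get a real _Static_assert, older ones a harmless no-op declaration. It exists so the super-block structures below it can be layout-checked at compile time, along the lines of this illustrative example (not a line from the diff):

```c
#include <stdint.h>

// illustrative stand-in: catch layout drift at compile time; under the
// pre-C11 fallback this degrades to a no-op struct declaration
struct example_block { float d; uint8_t qs[32]; };
static_assert(sizeof(struct example_block) == 36, "wrong example_block size/padding");
```
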