llama_cpp 0.3.1 → 0.3.3

@@ -65,7 +65,7 @@
  // ggml_set_f32(a, 3.0f);
  // ggml_set_f32(b, 4.0f);
  //
- // ggml_graph_compute(ctx0, &gf);
+ // ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
  //
  // printf("f = %f\n", ggml_get_f32_1d(f, 0));
  //
@@ -132,10 +132,10 @@
  // {
  // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
  //
- // // a[1, 2] = 1.0f;
+ // // a[2, 1] = 1.0f;
  // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
  //
- // // a[2, 0] = 2.0f;
+ // // a[0, 2] = 2.0f;
  // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
  //
  // ...
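The corrected comments above now match the stride arithmetic: nb[0] is the byte stride between consecutive elements within a row and nb[1] is the byte stride between rows, so the element written at offset 2*nb[1] + 1*nb[0] is row 2, column 1. A minimal sketch of a generic accessor built on that layout (the helper name is illustrative, not part of ggml.h):

    // illustrative helper: read element (i1 = row, i0 = column) of a 2-D f32 tensor
    // using the same byte-stride arithmetic as the header comment above
    static float tensor_get_f32_2d(const struct ggml_tensor * a, int i1, int i0) {
        return *(const float *) ((const char *) a->data + i1*a->nb[1] + i0*a->nb[0]);
    }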
@@ -197,10 +197,17 @@
  #define GGML_MAX_NODES 4096
  #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
- #define GGML_MAX_OPT 4
+ #define GGML_MAX_SRC 6
  #define GGML_MAX_NAME 48
  #define GGML_DEFAULT_N_THREADS 4
 
+
+ #define GGML_EXIT_SUCCESS 0
+ #define GGML_EXIT_ABORTED 1
+
+ #define GGML_UNUSED(x) (void)(x)
+
+
  #define GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
@@ -209,6 +216,30 @@
  } \
  } while (0)
 
+ // used to copy the number of elements and stride in bytes of tensors into local variables.
+ // main purpose is to reduce code duplication and improve readability.
+ //
+ // example:
+ //
+ // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+ // GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
+ //
+ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+ const type prefix##0 = (pointer)->array[0]; \
+ GGML_UNUSED(prefix##0);
+ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+ const type prefix##1 = (pointer)->array[1]; \
+ GGML_UNUSED(prefix##1);
+ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+ const type prefix##2 = (pointer)->array[2]; \
+ GGML_UNUSED(prefix##2);
+ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+ const type prefix##3 = (pointer)->array[3]; \
+ GGML_UNUSED(prefix##3);
+
  #ifdef __cplusplus
  extern "C" {
  #endif
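For readers of the new GGML_TENSOR_LOCALS macros above, a sketch of what the header's own example expands to (prefix ne1, tensor pointer src1, as in the comment):

    // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) expands roughly to:
    const int64_t ne10 = (src1)->ne[0]; GGML_UNUSED(ne10);
    const int64_t ne11 = (src1)->ne[1]; GGML_UNUSED(ne11);
    const int64_t ne12 = (src1)->ne[2]; GGML_UNUSED(ne12);
    const int64_t ne13 = (src1)->ne[3]; GGML_UNUSED(ne13);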
@@ -224,8 +255,8 @@ extern "C" {
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
  struct ggml_object;
  struct ggml_context;
@@ -295,12 +326,15 @@ extern "C" {
  GGML_OP_SUM,
  GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
+ GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
  GGML_OP_ABS,
  GGML_OP_SGN,
  GGML_OP_NEG,
  GGML_OP_STEP,
+ GGML_OP_TANH,
+ GGML_OP_ELU,
  GGML_OP_RELU,
  GGML_OP_GELU,
  GGML_OP_GELU_QUICK,
@@ -332,9 +366,10 @@ extern "C" {
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CLAMP,
- GGML_OP_CONV_1D_S1_PH,
- GGML_OP_CONV_1D_S2_PH,
- GGML_OP_CONV_2D_SK_P0,
+ GGML_OP_CONV_1D,
+ GGML_OP_CONV_2D,
+ GGML_OP_POOL_1D,
+ GGML_OP_POOL_2D,
 
  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -386,12 +421,7 @@ extern "C" {
  bool is_param;
 
  struct ggml_tensor * grad;
- struct ggml_tensor * src0;
- struct ggml_tensor * src1;
- struct ggml_tensor * opt[GGML_MAX_OPT];
-
- // thread scheduling
- int n_tasks;
+ struct ggml_tensor * src[GGML_MAX_SRC];
 
  // performance
  int perf_runs;
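Callers that previously read a node's inputs through src0, src1 and opt[] now index the unified src array. A minimal sketch of the updated traversal pattern (the node variable and the loop body are illustrative):

    // iterate over all inputs of a graph node after the src[] change
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        struct ggml_tensor * parent = node->src[i];
        if (parent == NULL) {
            continue; // unused input slot
        }
        // ... visit parent (e.g. dependency tracking, gradient accumulation)
    }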
@@ -404,19 +434,31 @@ extern "C" {
 
  void * extra; // extra things e.g. for ggml-cuda.cu
 
- char padding[4];
+ char padding[8];
  };
 
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+ // the compute plan that needs to be prepared for ggml_graph_compute()
+ // since https://github.com/ggerganov/ggml/issues/287
+ struct ggml_cplan {
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+ int n_threads;
+
+ // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+ int n_tasks[GGML_MAX_NODES];
+
+ // abort ggml_graph_compute when true
+ bool (*abort_callback)(void * data);
+ void * abort_callback_data;
+ };
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
  int n_leafs;
- int n_threads;
-
- size_t work_size;
- struct ggml_tensor * work;
 
  struct ggml_tensor * nodes[GGML_MAX_NODES];
  struct ggml_tensor * grads[GGML_MAX_NODES];
@@ -444,6 +486,9 @@ extern "C" {
 
 
  // compute types
+
+ // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+ // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
  enum ggml_task_type {
  GGML_TASK_INIT = 0,
  GGML_TASK_COMPUTE,
@@ -687,6 +732,11 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // argmax along rows
+ GGML_API struct ggml_tensor * ggml_argmax(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
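A small usage sketch for the new row-wise argmax (tensor names and shapes are assumptions for illustration; per the comment above, the reduction runs along each row of a):

    // logits: F32 matrix with n_vocab columns and n_rows rows
    struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_rows);
    // one index per row; presumably an integer tensor of length n_rows
    struct ggml_tensor * best   = ggml_argmax(ctx, logits);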
@@ -731,6 +781,22 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ GGML_API struct ggml_tensor * ggml_tanh(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_relu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
@@ -1081,58 +1147,58 @@ extern "C" {
  float min,
  float max);
 
- // TODO: implement general-purpose convolutions
- // GGML_API struct ggml_tensor * ggml_conv_1d(
- // struct ggml_context * ctx,
- // struct ggml_tensor * a,
- // struct ggml_tensor * b,
- // int s0
- // int p0,
- // int d0);
- //
- // GGML_API struct ggml_tensor * ggml_conv_2d(
- // struct ggml_context * ctx,
- // struct ggml_tensor * a,
- // struct ggml_tensor * b,
- // int s0,
- // int s1,
- // int p0,
- // int p1,
- // int d0,
- // int d1);
-
- // padding = half
- // TODO: we don't support extra parameters for now
- // that's why we are hard-coding the stride, padding, and dilation
- // not great ..
- // example:
- // a: 3 80 768 1
- // b: 3000 80 1 1
- // res: 3000 768 1 1
- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+ GGML_API struct ggml_tensor * ggml_conv_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s0, // stride
+ int p0, // padding
+ int d0); // dilation
 
- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+ GGML_API struct ggml_tensor * ggml_conv_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1);
 
- // kernel size is a->ne[0] x a->ne[1]
- // stride is equal to kernel size
- // padding is zero
- // example:
- // a: 16 16 3 768
- // b: 1024 1024 3 1
- // res: 64 64 768 1
- // used in sam
- GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+ // conv_1d with padding = half
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s,
+ int d);
+
+ enum ggml_op_pool {
+ GGML_OP_POOL_MAX,
+ GGML_OP_POOL_AVG,
+ GGML_OP_POOL_COUNT,
+ };
+
+ GGML_API struct ggml_tensor* ggml_pool_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0, // kernel size
+ int s0, // stride
+ int p0); // padding
+
+ GGML_API struct ggml_tensor* ggml_pool_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ int p0,
+ int p1);
 
  GGML_API struct ggml_tensor * ggml_flash_attn(
  struct ggml_context * ctx,
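The whisper/sam-specific convolutions are replaced by general versions with explicit stride, padding and dilation, plus new pooling operators. A sketch of how an old ggml_conv_1d_s1_ph call site maps onto the new API (the mapping s = 1, d = 1 is my reading of the s1/"padding = half" naming; the pooling call just demonstrates the new argument order):

    // old (<= 0.3.1): cur = ggml_conv_1d_s1_ph(ctx, a, b);
    // new (>= 0.3.2): same "padding = half" behaviour via the explicit-parameter alias
    struct ggml_tensor * cur = ggml_conv_1d_ph(ctx, a, b, /*s =*/ 1, /*d =*/ 1);
    // equivalently, per the alias comment above:
    // cur = ggml_conv_1d(ctx, a, b, /*s0 =*/ 1, /*p0 =*/ a->ne[0]/2, /*d0 =*/ 1);

    // 1-D max pooling over cur: kernel 2, stride 2, no padding
    struct ggml_tensor * pooled = ggml_pool_1d(ctx, cur, GGML_OP_POOL_MAX, /*k0 =*/ 2, /*s0 =*/ 2, /*p0 =*/ 0);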
@@ -1263,15 +1329,22 @@ extern "C" {
 
  GGML_API void ggml_set_param(
  struct ggml_context * ctx,
- struct ggml_tensor * tensor);
+ struct ggml_tensor * tensor);
 
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
- GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
+ GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+ GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
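For callers migrating from the old single-call ggml_graph_compute(ctx, &gf), a minimal sketch of the new plan-then-compute flow under the declarations above (malloc-based buffer handling is illustrative; ggml_graph_compute_with_ctx remains the convenience path when the context has spare memory for the work buffer):

    #include <stdlib.h> // malloc/free for the work buffer
    #include "ggml.h"

    // illustrative helper: run an already-built graph with the plan/compute API
    static int compute_graph(struct ggml_cgraph * gf, int n_threads) {
        // 1) size the scratch buffer for this graph and thread count
        struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

        // 2) the caller now owns the work buffer (it used to live inside ggml_cgraph)
        if (plan.work_size > 0) {
            plan.work_data = malloc(plan.work_size);
            if (plan.work_data == NULL) {
                return GGML_EXIT_ABORTED; // illustrative error handling
            }
        }

        // 3) run; returns GGML_EXIT_SUCCESS, or GGML_EXIT_ABORTED if plan.abort_callback fired
        const int status = ggml_graph_compute(gf, &plan);

        free(plan.work_data);
        return status;
    }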
@@ -1488,25 +1561,24 @@ extern "C" {
  //
 
  #ifdef __cplusplus
- // restrict not standard in C++
+ // restrict not standard in C++
  #define GGML_RESTRICT
  #else
  #define GGML_RESTRICT restrict
  #endif
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+ typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
  typedef struct {
- dequantize_row_q_t dequantize_row_q;
- quantize_row_q_t quantize_row_q;
- quantize_row_q_t quantize_row_q_reference;
- quantize_row_q_t quantize_row_q_dot;
- vec_dot_q_t vec_dot_q;
- enum ggml_type vec_dot_type;
- } quantize_fns_t;
-
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+ ggml_to_float_t to_float;
+ ggml_from_float_t from_float;
+ ggml_from_float_t from_float_reference;
+ ggml_vec_dot_t vec_dot;
+ enum ggml_type vec_dot_type;
+ } ggml_type_traits_t;
+
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
  #ifdef __cplusplus
  }
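Downstream code that consumed quantize_fns_t now goes through the renamed traits table; a short sketch (GGML_TYPE_Q4_0 is just an example type, and the block-alignment requirements of the quantized format still apply):

    // dequantize n elements of Q4_0 data to float via the per-type traits
    void dequantize_row(const void * quantized, float * out, int n) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
        traits.to_float(quantized, out, n); // was dequantize_row_q in quantize_fns_t
    }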