llama_cpp 0.3.1 → 0.3.3

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only. All hunks below are against the bundled ggml header; they track ggml's move from context-managed graph computation to caller-provided compute plans, along with new ops (argmax, tanh, elu, general conv/pool) and the renamed quantization traits.
@@ -65,7 +65,7 @@
  //    ggml_set_f32(a, 3.0f);
  //    ggml_set_f32(b, 4.0f);
  //
- //    ggml_graph_compute(ctx0, &gf);
+ //    ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
  //
  //    printf("f = %f\n", ggml_get_f32_1d(f, 0));
  //
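The hunk above updates the header's worked example: the old ggml_graph_compute(ctx0, &gf) call is gone, and the example now routes through ggml_graph_compute_with_ctx(). A minimal sketch of the updated flow, assuming a context sized generously enough to hold both the tensors and the work data (the 16 MiB figure is an illustrative guess, not from the diff):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024, // assumed large enough for tensors + work data
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        struct ggml_tensor * f = ggml_mul(ctx, a, b);

        struct ggml_cgraph gf = ggml_build_forward(f);

        ggml_set_f32(a, 3.0f);
        ggml_set_f32(b, 4.0f);

        // new in these headers: the work buffer is carved out of ctx
        ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/ 4);

        printf("f = %f\n", ggml_get_f32_1d(f, 0)); // 12.000000

        ggml_free(ctx);
        return 0;
    }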
@@ -132,10 +132,10 @@
  //   {
  //       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
  //
- //       // a[1, 2] = 1.0f;
+ //       // a[2, 1] = 1.0f;
  //       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
  //
- //       // a[2, 0] = 2.0f;
+ //       // a[0, 2] = 2.0f;
  //       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
  //
  //   ...
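The two corrected comments fix transposed coordinates: the pointer arithmetic was always i1*nb[1] + i0*nb[0], but the old comments labeled the elements as a[1, 2] and a[2, 0]. A small helper spelling out the convention the fixed comments now match (the function name is ours, not from the header):

    // value at index i0 along dim 0 and i1 along dim 1, i.e. a[i1, i0]
    static float tensor_get_f32_2d(const struct ggml_tensor * a, int i0, int i1) {
        return *(const float *) ((const char *) a->data + i1*a->nb[1] + i0*a->nb[0]);
    }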
@@ -197,10 +197,17 @@
  #define GGML_MAX_NODES 4096
  #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
- #define GGML_MAX_OPT 4
+ #define GGML_MAX_SRC 6
  #define GGML_MAX_NAME 48
  #define GGML_DEFAULT_N_THREADS 4

+
+ #define GGML_EXIT_SUCCESS 0
+ #define GGML_EXIT_ABORTED 1
+
+ #define GGML_UNUSED(x) (void)(x)
+
+
  #define GGML_ASSERT(x) \
      do { \
          if (!(x)) { \
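GGML_EXIT_SUCCESS and GGML_EXIT_ABORTED are the return values of the reworked ggml_graph_compute() further down in this diff, and GGML_UNUSED() is the warning-suppression helper the new GGML_TENSOR_LOCALS macros lean on. A trivial illustration of the latter (the variable is ours):

    const int n_dims = 4;
    GGML_UNUSED(n_dims); // expands to (void)(n_dims); silences unused-variable warnings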
@@ -209,6 +216,30 @@
          } \
      } while (0)

+ // used to copy the number of elements and stride in bytes of tensors into local variables.
+ // main purpose is to reduce code duplication and improve readability.
+ //
+ // example:
+ //
+ //    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+ //    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+ //
+ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+     const type prefix##0 = (pointer)->array[0]; \
+     GGML_UNUSED(prefix##0);
+ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+     const type prefix##1 = (pointer)->array[1]; \
+     GGML_UNUSED(prefix##1);
+ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+     const type prefix##2 = (pointer)->array[2]; \
+     GGML_UNUSED(prefix##2);
+ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+     const type prefix##3 = (pointer)->array[3]; \
+     GGML_UNUSED(prefix##3);
+
  #ifdef __cplusplus
  extern "C" {
  #endif
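For reference, the example invocation from the new comment, GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne), expands (after the preprocessor, with GGML_UNUSED resolved) to:

    const int64_t ne10 = (src1)->ne[0]; (void)(ne10);
    const int64_t ne11 = (src1)->ne[1]; (void)(ne11);
    const int64_t ne12 = (src1)->ne[2]; (void)(ne12);
    const int64_t ne13 = (src1)->ne[3]; (void)(ne13);

which is exactly the per-operand boilerplate the compute kernels previously wrote out by hand.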
@@ -224,8 +255,8 @@ extern "C" {
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);

  struct ggml_object;
  struct ggml_context;
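The row converters now take an int count instead of size_t, matching the int-based element counts used elsewhere in the API. Usage is unchanged apart from the type (the buffers here are illustrative):

    float       src[8] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f };
    ggml_fp16_t dst[8];
    float       back[8];

    ggml_fp32_to_fp16_row(src, dst, 8);  // n is now int, was size_t
    ggml_fp16_to_fp32_row(dst, back, 8); // round-trips, modulo fp16 precision loss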
@@ -295,12 +326,15 @@ extern "C" {
      GGML_OP_SUM,
      GGML_OP_SUM_ROWS,
      GGML_OP_MEAN,
+     GGML_OP_ARGMAX,
      GGML_OP_REPEAT,
      GGML_OP_REPEAT_BACK,
      GGML_OP_ABS,
      GGML_OP_SGN,
      GGML_OP_NEG,
      GGML_OP_STEP,
+     GGML_OP_TANH,
+     GGML_OP_ELU,
      GGML_OP_RELU,
      GGML_OP_GELU,
      GGML_OP_GELU_QUICK,
@@ -332,9 +366,10 @@ extern "C" {
      GGML_OP_ROPE_BACK,
      GGML_OP_ALIBI,
      GGML_OP_CLAMP,
-     GGML_OP_CONV_1D_S1_PH,
-     GGML_OP_CONV_1D_S2_PH,
-     GGML_OP_CONV_2D_SK_P0,
+     GGML_OP_CONV_1D,
+     GGML_OP_CONV_2D,
+     GGML_OP_POOL_1D,
+     GGML_OP_POOL_2D,

      GGML_OP_FLASH_ATTN,
      GGML_OP_FLASH_FF,
@@ -386,12 +421,7 @@ extern "C" {
      bool is_param;

      struct ggml_tensor * grad;
-     struct ggml_tensor * src0;
-     struct ggml_tensor * src1;
-     struct ggml_tensor * opt[GGML_MAX_OPT];
-
-     // thread scheduling
-     int n_tasks;
+     struct ggml_tensor * src[GGML_MAX_SRC];

      // performance
      int perf_runs;
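This is the core layout change of the release: the fixed src0/src1 slots and the opt[] overflow array collapse into a single src[GGML_MAX_SRC] array, and per-node thread scheduling moves out of the tensor (it reappears below in ggml_cplan). Code that walked a node's operands now loops over one array; a sketch of the new traversal (the function name is ours):

    static void visit_operands(const struct ggml_tensor * node) {
        // was: node->src0, node->src1, then node->opt[0..GGML_MAX_OPT)
        for (int i = 0; i < GGML_MAX_SRC; ++i) {
            if (node->src[i] != NULL) {
                // process operand i
            }
        }
    }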
@@ -404,19 +434,31 @@ extern "C" {

      void * extra; // extra things e.g. for ggml-cuda.cu

-     char padding[4];
+     char padding[8];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+ // the compute plan that needs to be prepared for ggml_graph_compute()
+ // since https://github.com/ggerganov/ggml/issues/287
+ struct ggml_cplan {
+     size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+     uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+     int n_threads;
+
+     // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+     int n_tasks[GGML_MAX_NODES];
+
+     // abort ggml_graph_compute when true
+     bool (*abort_callback)(void * data);
+     void * abort_callback_data;
+ };
+
  // computation graph
  struct ggml_cgraph {
      int n_nodes;
      int n_leafs;
-     int n_threads;
-
-     size_t work_size;
-     struct ggml_tensor * work;

      struct ggml_tensor * nodes[GGML_MAX_NODES];
      struct ggml_tensor * grads[GGML_MAX_NODES];
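ggml_cplan takes over everything the removed cgraph fields (n_threads, work_size, work) and the removed per-tensor n_tasks used to hold, and adds cooperative cancellation. A sketch of filling one in by hand, including the new abort hook (the callback body and ownership pattern are our assumptions):

    #include "ggml.h"
    #include <stdbool.h>
    #include <stdlib.h>

    static bool should_abort(void * data) {
        // returning true makes ggml_graph_compute() stop and return GGML_EXIT_ABORTED
        return *(volatile bool *) data;
    }

    int compute_cancellable(struct ggml_cgraph * gf, volatile bool * stop_flag) {
        struct ggml_cplan plan = ggml_graph_plan(gf, GGML_DEFAULT_N_THREADS);

        uint8_t * work = NULL;
        if (plan.work_size > 0) {
            work = malloc(plan.work_size); // the caller owns the work buffer now
            plan.work_data = work;
        }

        plan.abort_callback      = should_abort;
        plan.abort_callback_data = (void *) stop_flag;

        const int rc = ggml_graph_compute(gf, &plan);
        free(work);
        return rc; // GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED
    }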
@@ -444,6 +486,9 @@ extern "C" {


  // compute types
+
+ // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+ // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
  enum ggml_task_type {
      GGML_TASK_INIT = 0,
      GGML_TASK_COMPUTE,
@@ -687,6 +732,11 @@ extern "C" {
          struct ggml_context * ctx,
          struct ggml_tensor * a);

+ // argmax along rows
+ GGML_API struct ggml_tensor * ggml_argmax(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
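A short usage sketch for the new reduction (shapes and names are ours; per the header comment, the argmax is taken along rows):

    // logits: 32000 columns x 8 rows (illustrative vocab and batch sizes)
    struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32000, 8);
    struct ggml_tensor * best   = ggml_argmax(ctx, logits); // one index per row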
@@ -731,6 +781,22 @@ extern "C" {
          struct ggml_context * ctx,
          struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_tanh(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_relu(
          struct ggml_context * ctx,
          struct ggml_tensor * a);
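The new unary ops follow the pattern of the surrounding declarations: the plain form produces a fresh tensor, the _inplace form reuses the input's buffer. For example (x is assumed to be an F32 tensor living in ctx):

    struct ggml_tensor * t = ggml_tanh(ctx, x);        // result in a new tensor
    struct ggml_tensor * e = ggml_elu_inplace(ctx, x); // overwrites x's data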
@@ -1081,58 +1147,58 @@ extern "C" {
          float min,
          float max);

- // TODO: implement general-purpose convolutions
- // GGML_API struct ggml_tensor * ggml_conv_1d(
- //         struct ggml_context * ctx,
- //         struct ggml_tensor * a,
- //         struct ggml_tensor * b,
- //         int s0
- //         int p0,
- //         int d0);
- //
- // GGML_API struct ggml_tensor * ggml_conv_2d(
- //         struct ggml_context * ctx,
- //         struct ggml_tensor * a,
- //         struct ggml_tensor * b,
- //         int s0,
- //         int s1,
- //         int p0,
- //         int p1,
- //         int d0,
- //         int d1);
-
- // padding = half
- // TODO: we don't support extra parameters for now
- // that's why we are hard-coding the stride, padding, and dilation
- // not great ..
- // example:
- // a:      3    80  768    1
- // b:   3000    80    1    1
- // res: 3000   768    1    1
- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+ GGML_API struct ggml_tensor * ggml_conv_1d(
          struct ggml_context * ctx,
          struct ggml_tensor * a,
-         struct ggml_tensor * b);
+         struct ggml_tensor * b,
+         int s0,  // stride
+         int p0,  // padding
+         int d0); // dilation

- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+ GGML_API struct ggml_tensor * ggml_conv_2d(
          struct ggml_context * ctx,
          struct ggml_tensor * a,
-         struct ggml_tensor * b);
+         struct ggml_tensor * b,
+         int s0,
+         int s1,
+         int p0,
+         int p1,
+         int d0,
+         int d1);

- // kernel size is a->ne[0] x a->ne[1]
- // stride is equal to kernel size
- // padding is zero
- // example:
- // a:     16   16    3  768
- // b:   1024 1024    3    1
- // res:   64   64  768    1
- // used in sam
- GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+ // conv_1d with padding = half
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
          struct ggml_context * ctx,
          struct ggml_tensor * a,
-         struct ggml_tensor * b);
+         struct ggml_tensor * b,
+         int s,
+         int d);
+
+ enum ggml_op_pool {
+     GGML_OP_POOL_MAX,
+     GGML_OP_POOL_AVG,
+     GGML_OP_POOL_COUNT,
+ };
+
+ GGML_API struct ggml_tensor* ggml_pool_1d(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a,
+         enum ggml_op_pool op,
+         int k0, // kernel size
+         int s0, // stride
+         int p0); // padding
+
+ GGML_API struct ggml_tensor* ggml_pool_2d(
+         struct ggml_context * ctx,
+         struct ggml_tensor * a,
+         enum ggml_op_pool op,
+         int k0,
+         int k1,
+         int s0,
+         int s1,
+         int p0,
+         int p1);

  GGML_API struct ggml_tensor * ggml_flash_attn(
          struct ggml_context * ctx,
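The hard-coded whisper/sam convolutions give way to general stride/padding/dilation parameters, with ggml_conv_1d_ph() preserving the old "padding = half" shortcut. A sketch of the old call rewritten both ways, plus one of the new pooling entry points (tensors a, b, x are assumed to exist in ctx):

    // old: ggml_conv_1d_s1_ph(ctx, a, b) -- stride 1, half padding
    struct ggml_tensor * c0 = ggml_conv_1d(ctx, a, b, /*s0=*/ 1, /*p0=*/ a->ne[0]/2, /*d0=*/ 1);
    struct ggml_tensor * c1 = ggml_conv_1d_ph(ctx, a, b, /*s=*/ 1, /*d=*/ 1); // same, via the alias

    // new: 2x2 max pooling with stride 2 and no padding
    struct ggml_tensor * p = ggml_pool_2d(ctx, x, GGML_OP_POOL_MAX,
                                          /*k0=*/ 2, /*k1=*/ 2,
                                          /*s0=*/ 2, /*s1=*/ 2,
                                          /*p0=*/ 0, /*p1=*/ 0);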
@@ -1263,15 +1329,22 @@ extern "C" {

  GGML_API void ggml_set_param(
          struct ggml_context * ctx,
-         struct ggml_tensor * tensor);
+         struct ggml_tensor  * tensor);

  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

- GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
+ GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+ GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);
+
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

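This is the breaking change the rest of the diff builds up to: ggml_graph_compute() no longer takes a context, returns an int exit code, and expects a caller-prepared ggml_cplan. Migration is either one line (the convenience wrapper) or an explicit plan (shown in full in the ggml_cplan sketch earlier):

    // before: cgraph.n_threads = 4; ggml_graph_compute(ctx, &cgraph);

    // after, option 1: let the context provide the work buffer
    ggml_graph_compute_with_ctx(ctx, &cgraph, /*n_threads=*/ 4);

    // after, option 2: plan explicitly and own the work buffer
    struct ggml_cplan plan = ggml_graph_plan(&cgraph, /*n_threads=*/ 4);
    // ...allocate plan.work_data when plan.work_size > 0, then:
    int rc = ggml_graph_compute(&cgraph, &plan); // GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED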
@@ -1488,25 +1561,24 @@ extern "C" {

  //
  #ifdef __cplusplus
- // restrict not standard in C++
+ // restrict not standard in C++
  #define GGML_RESTRICT
  #else
  #define GGML_RESTRICT restrict
  #endif
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+ typedef void (*ggml_to_float_t)  (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

  typedef struct {
-     dequantize_row_q_t dequantize_row_q;
-     quantize_row_q_t   quantize_row_q;
-     quantize_row_q_t   quantize_row_q_reference;
-     quantize_row_q_t   quantize_row_q_dot;
-     vec_dot_q_t        vec_dot_q;
-     enum ggml_type     vec_dot_type;
- } quantize_fns_t;
-
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+     ggml_to_float_t   to_float;
+     ggml_from_float_t from_float;
+     ggml_from_float_t from_float_reference;
+     ggml_vec_dot_t    vec_dot;
+     enum ggml_type    vec_dot_type;
+ } ggml_type_traits_t;
+
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);

  #ifdef __cplusplus
  }
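The quantization function table is renamed and trimmed: dequantize_row_q/quantize_row_q become to_float/from_float, quantize_row_q_dot disappears (the vec_dot_type field now tells you which type's traits to look up instead), and the lookup takes an enum ggml_type rather than a size_t index. A usage sketch (buffer sizing is illustrative; real quantized sizes depend on the block layout of the type):

    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);

    float   row[32] = {0}; // 32 floats = one q4_0 block
    uint8_t quantized[32]; // deliberately oversized for the sketch

    traits.from_float(row, quantized, 32); // was quantize_row_q
    traits.to_float(quantized, row, 32);   // was dequantize_row_q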