llama_cpp 0.5.3 → 0.7.0

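The diff below covers ggml.h, the core tensor-library header vendored by the gem (the file identification is inferred from its contents); hunk headers refer to line numbers within that file.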
@@ -214,8 +214,8 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS          4
-#define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        256
+#define GGML_MAX_NODES         16384
+#define GGML_MAX_PARAMS        1024
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          64
@@ -248,6 +248,14 @@
         } \
     } while (0)
 
+#ifndef NDEBUG
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#elif defined(__GNUC__)
+#define GGML_UNREACHABLE() __builtin_unreachable()
+#else
+#define GGML_UNREACHABLE() ((void) 0)
+#endif
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
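GGML_UNREACHABLE() marks code paths that must never execute: debug builds assert, GCC/Clang release builds get __builtin_unreachable() as an optimizer hint, and other compilers get a no-op. A minimal usage sketch (the function is hypothetical):

    // hypothetical validator: earlier code guarantees n is 0, 1, or 2
    static void check_small(int n) {
        switch (n) {
            case 0: case 1: case 2:
                break;              // expected values
            default:
                GGML_UNREACHABLE(); // debug: assert; GCC/Clang release: optimizer hint
        }
    }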
@@ -393,10 +401,14 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
+        GGML_OP_CONV_1D_STAGE_0, // internal
+        GGML_OP_CONV_1D_STAGE_1, // internal
+
         GGML_OP_UPSCALE, // nearest interpolate
 
         GGML_OP_FLASH_ATTN,
@@ -445,6 +457,12 @@ extern "C" {
         GGML_OBJECT_WORK_BUFFER
     };
 
+    enum ggml_log_level {
+        GGML_LOG_LEVEL_ERROR = 2,
+        GGML_LOG_LEVEL_WARN = 3,
+        GGML_LOG_LEVEL_INFO = 4
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -467,8 +485,8 @@ extern "C" {
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                                   // nb[0] = sizeof(type)
-                                   // nb[1] = nb[0]   * ne[0] + padding
+                                   // nb[0] = ggml_type_size(type)
+                                   // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
                                    // nb[i] = nb[i-1] * ne[i-1]
 
         // compute data
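The corrected stride comment matters for block-quantized types, where ne[0] counts logical values but storage advances whole blocks: a row takes ggml_type_size(type) bytes per block times ne[0] / ggml_blck_size(type) blocks. A sketch of the arithmetic (the Q4_0 figures of 32 values and 18 bytes per block describe ggml's current layout and should be treated as an assumption):

    #include "ggml.h"

    // row stride nb[1] (ignoring padding), per the corrected comment
    static size_t row_stride(enum ggml_type type, int64_t ne0) {
        return ggml_type_size(type) * (ne0 / ggml_blck_size(type));
    }
    // e.g. row_stride(GGML_TYPE_Q4_0, 4096) == 18 * (4096/32) == 2304 bytes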
@@ -520,7 +538,15 @@ extern "C" {
     // next prime after GGML_MAX_NODES
     // #define GGML_GRAPH_HASHTABLE_SIZE 4099
     // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+    // #define GGML_GRAPH_HASHTABLE_SIZE 8273
+    // #define GGML_GRAPH_HASHTABLE_SIZE 16411
+    #define GGML_GRAPH_HASHTABLE_SIZE 32771
+
+    enum ggml_cgraph_eval_order {
+        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+        GGML_CGRAPH_EVAL_ORDER_COUNT
+    };
 
     // computation graph
     struct ggml_cgraph {
@@ -533,6 +559,8 @@ extern "C" {
 
         void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
 
+        enum ggml_cgraph_eval_order order;
+
         // performance
         int     perf_runs;
         int64_t perf_cycles;
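A hedged sketch of the new eval-order knob: judging from the declaration alone, the field selects the direction in which graph construction visits a node's operands, which training code can flip to influence node ordering and tensor reuse; treat these semantics as an assumption.

    struct ggml_cgraph gf = ggml_build_forward(loss);  // default order: LEFT_TO_RIGHT
    gf.order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;   // affects subsequent expansion
    ggml_build_forward_expand(&gf, aux_output);        // aux_output is hypothetical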
@@ -680,12 +708,21 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 
+    // Converts a flat index into coordinates
+    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
     GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
     GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 
+    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
     GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
     GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 
+    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
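The new _nd accessors index by coordinates rather than a flat offset, and ggml_unravel_index converts between the two. A minimal sketch (t is assumed to be an allocated f32 tensor with at least 43 elements):

    int64_t i0, i1, i2, i3;
    ggml_unravel_index(t, 42, &i0, &i1, &i2, &i3);

    float a = ggml_get_f32_1d(t, 42);             // flat index
    float b = ggml_get_f32_nd(t, i0, i1, i2, i3); // same element, so a == b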
@@ -719,6 +756,12 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add_cast(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            enum   ggml_type      type);
+
     GGML_API struct ggml_tensor * ggml_add1(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
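ggml_add_cast behaves like ggml_add but emits the sum in an explicitly chosen type; it appears aimed at training-style graphs where, for example, a quantized weight plus an f32 delta should come out as f32 without a separate cast node (take that motivation as an assumption). The call shape:

    // w: quantized base weight, dw: f32 delta of the same shape (both assumed to exist)
    struct ggml_tensor * w_new = ggml_add_cast(ctx, w, dw, GGML_TYPE_F32);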
@@ -828,6 +871,7 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // sums repetitions in a into shape of b
     GGML_API struct ggml_tensor * ggml_repeat_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1049,7 +1093,6 @@ extern "C" {
             size_t                nb1,
             size_t                offset);
 
-
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
@@ -1072,6 +1115,33 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // make contiguous, with new shape
+    GGML_API struct ggml_tensor * ggml_cont_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
+    GGML_API struct ggml_tensor * ggml_cont_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1);
+
+    GGML_API struct ggml_tensor * ggml_cont_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2);
+
+    GGML_API struct ggml_tensor * ggml_cont_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
     // return view(a), b specifies the new shape
     // TODO: when we start computing gradient, make a copy instead of view
     GGML_API struct ggml_tensor * ggml_reshape(
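The ggml_cont_1d..4d helpers fuse ggml_cont with a reshape, so a single node both compacts a non-contiguous view and gives it its target shape. A hedged sketch (v is assumed to be a 2-D f32 tensor):

    // transpose, then materialize contiguously in one step, instead of
    // ggml_reshape_2d(ctx, ggml_cont(ctx, v_t), ne0, ne1)
    struct ggml_tensor * v_t = ggml_transpose(ctx, v);
    struct ggml_tensor * v_c = ggml_cont_2d(ctx, v_t, v_t->ne[0], v_t->ne[1]);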
@@ -1219,14 +1289,15 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements
+    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
-    // TODO: avoid creating a new tensor every time
+    //
+    // b is an int32 vector with size a->ne[2], it contains the positions
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
             int                   n_dims,
             int                   mode,
             int                   n_ctx);
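This is the headline API break of the release: every RoPE variant below now takes a tensor of positions instead of a scalar n_past, with one int32 entry per token (b has size a->ne[2]). A hedged sketch of the new calling convention (n_tokens, n_past, n_rot, and q are hypothetical):

    // positions are explicit now, so they no longer need to be consecutive
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        ggml_set_i32_1d(pos, i, n_past + i);
    }
    struct ggml_tensor * q_rot = ggml_rope(ctx, q, pos, n_rot, /*mode*/ 0, n_ctx);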
@@ -1235,7 +1306,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
             int                   n_dims,
             int                   mode,
             int                   n_ctx);
@@ -1244,7 +1315,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_custom(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
@@ -1255,7 +1326,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
@@ -1266,7 +1337,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
             int                   n_dims,
             float                 base,
             bool                  down);
@@ -1276,7 +1347,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
@@ -1319,6 +1390,14 @@ extern "C" {
             int                   s,
             int                   d);
 
+    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   p0,
+            int                   d0);
+
     GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
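For the new transposed 1-D convolution the usual output-length relation applies; the header does not say which p0/d0 values this version actually supports, so treat the general formula as the reference and the parameter support as an assumption:

    // a: kernel of width K, b: input of length L_in
    // L_out = (L_in - 1)*s0 - 2*p0 + d0*(K - 1) + 1
    // e.g. K = 4, L_in = 100, s0 = 2, p0 = 0, d0 = 1:
    //      L_out = 99*2 + 3 + 1 = 202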
@@ -1656,6 +1735,16 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
+    // build gradient checkpointing backward graph gb for gf using provided checkpoints
+    // gb_tmp will contain original backward graph with rewritten backward process nodes,
+    // but without the second forward pass nodes.
+    GGML_API void ggml_build_backward_gradient_checkpointing(
+            struct ggml_context   * ctx,
+            struct ggml_cgraph    * gf,
+            struct ggml_cgraph    * gb,
+            struct ggml_cgraph    * gb_tmp,
+            struct ggml_tensor  * * checkpoints,
+            int                     n_checkpoints);
     //
     // optimization
     //
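Gradient checkpointing trades compute for memory: only the listed checkpoint tensors are kept alive from the forward pass, and the stretches between them are recomputed while evaluating gb. A hedged usage sketch (checkpointing one tensor per layer is a common but not mandated choice; all names are hypothetical):

    struct ggml_tensor * checkpoints[N_LAYERS];
    for (int il = 0; il < N_LAYERS; ++il) {
        checkpoints[il] = layer_out[il]; // one surviving tensor per layer
    }
    ggml_build_backward_gradient_checkpointing(
        ctx, &gf, &gb, &gb_tmp, checkpoints, N_LAYERS);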
@@ -1682,6 +1771,7 @@ extern "C" {
         GGML_OPT_NO_CONTEXT,
         GGML_OPT_INVALID_WOLFE,
         GGML_OPT_FAIL,
+        GGML_OPT_CANCEL,
 
         GGML_LINESEARCH_FAIL = -128,
         GGML_LINESEARCH_MINIMUM_STEP,
@@ -1690,7 +1780,8 @@ extern "C" {
         GGML_LINESEARCH_INVALID_PARAMETERS,
     };
 
-    typedef void (*ggml_opt_callback)(void * data, float * sched);
+    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
     // optimization parameters
     //
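The optimizer callback gains the current gradient-accumulation step and an out-parameter for cancellation; setting *cancel presumably makes the optimizer return the new GGML_OPT_CANCEL code (an assumption consistent with the enum addition above). A sketch of both callback shapes:

    #include <stdio.h>
    #include "ggml.h"

    static void my_opt_cb(void * data, int accum_step, float * sched, bool * cancel) {
        (void) data;
        if (accum_step == 0) {
            *sched *= 0.99f; // e.g. decay the schedule multiplier once per batch
        }
        *cancel = false;     // set to true to stop the run early
    }

    static void my_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level <= GGML_LOG_LEVEL_WARN) { // ERROR = 2, WARN = 3
            fputs(text, stderr);
        }
    }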
@@ -1721,6 +1812,8 @@ extern "C" {
         bool print_forward_graph;
         bool print_backward_graph;
 
+        int n_gradient_accumulation;
+
         // ADAM parameters
         struct {
             int n_iter;
@@ -1766,6 +1859,7 @@ extern "C" {
         float loss_after;
 
         struct {
+            struct ggml_tensor * g;  // current gradient
             struct ggml_tensor * m;  // first moment
             struct ggml_tensor * v;  // second moment
             struct ggml_tensor * pf; // past function values
@@ -1882,26 +1976,26 @@ extern "C" {
 
     GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
     GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
-
-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
-
-    // results are undefined if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int i);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int i);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int i);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int i);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int i);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int i);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int i);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int i);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int i);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int i);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int i);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
-    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int i);
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
+
+    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
+    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
+
+    // will abort if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
+    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
+    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
+    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
+    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
+    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
+    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
+    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
+    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
+    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
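This is a behavior change, not just a parameter rename: the typed getters previously had undefined results on a type mismatch and now abort, so check gguf_get_kv_type first whenever a key's type is not guaranteed. A minimal sketch (gctx is an open gguf_context pointer; stdio.h is assumed):

    // read an optional u32 key without risking an abort
    int key_id = gguf_find_key(gctx, "general.quantization_version");
    if (key_id >= 0 && gguf_get_kv_type(gctx, key_id) == GGUF_TYPE_UINT32) {
        uint32_t qnt = gguf_get_val_u32(gctx, key_id);
        printf("quantization version: %u\n", qnt);
    }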