llama_cpp 0.5.3 → 0.7.0

This diff compares the publicly released contents of these two package versions as they appear in their public registry, and is provided for informational purposes only. The hunks below are from the vendored ggml.h header.
@@ -214,8 +214,8 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS 4
-#define GGML_MAX_NODES 4096
-#define GGML_MAX_PARAMS 256
+#define GGML_MAX_NODES 16384
+#define GGML_MAX_PARAMS 1024
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 64
@@ -248,6 +248,14 @@
         } \
     } while (0)
 
+#ifndef NDEBUG
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#elif defined(__GNUC__)
+#define GGML_UNREACHABLE() __builtin_unreachable()
+#else
+#define GGML_UNREACHABLE() ((void) 0)
+#endif
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
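
The new GGML_UNREACHABLE() macro asserts in debug builds, becomes __builtin_unreachable() in GCC/Clang release builds, and is a no-op elsewhere. A minimal sketch of the intended pattern, using the ggml_cgraph_eval_order enum added later in this diff (the helper function itself is hypothetical):

    // Exhaustive switch: if control reaches the end, it is a programming
    // error, so mark it unreachable instead of returning a dummy value.
    static const char * eval_order_name(enum ggml_cgraph_eval_order order) {
        switch (order) {
            case GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT: return "left-to-right";
            case GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT: return "right-to-left";
            case GGML_CGRAPH_EVAL_ORDER_COUNT:         break;
        }
        GGML_UNREACHABLE();
    }
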
@@ -393,10 +401,14 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
+        GGML_OP_CONV_1D_STAGE_0, // internal
+        GGML_OP_CONV_1D_STAGE_1, // internal
+
         GGML_OP_UPSCALE, // nearest interpolate
 
         GGML_OP_FLASH_ATTN,
@@ -445,6 +457,12 @@ extern "C" {
         GGML_OBJECT_WORK_BUFFER
     };
 
+    enum ggml_log_level {
+        GGML_LOG_LEVEL_ERROR = 2,
+        GGML_LOG_LEVEL_WARN = 3,
+        GGML_LOG_LEVEL_INFO = 4
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -467,8 +485,8 @@ extern "C" {
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                                   // nb[0] = sizeof(type)
-                                   // nb[1] = nb[0] * ne[0] + padding
+                                   // nb[0] = ggml_type_size(type)
+                                   // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
                                    // nb[i] = nb[i-1] * ne[i-1]
 
         // compute data
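
The corrected stride comment matters for quantized types, where nb[0] is the byte size of one block rather than one element. A hedged sketch of what the formula yields (assumes an initialized ggml_context `ctx`):

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 32);
    // For Q4_0, ggml_blck_size() is 32 elements per block and
    // ggml_type_size() is the byte size of one block, so (no padding here):
    //   t->nb[0] == ggml_type_size(GGML_TYPE_Q4_0)
    //   t->nb[1] == t->nb[0] * (t->ne[0] / ggml_blck_size(GGML_TYPE_Q4_0)) // 128 blocks per row
    //   t->nb[2] == t->nb[1] * t->ne[1]
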
@@ -520,7 +538,15 @@ extern "C" {
    // next prime after GGML_MAX_NODES
    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-   #define GGML_GRAPH_HASHTABLE_SIZE 8273
+   // #define GGML_GRAPH_HASHTABLE_SIZE 8273
+   // #define GGML_GRAPH_HASHTABLE_SIZE 16411
+   #define GGML_GRAPH_HASHTABLE_SIZE 32771
+
+   enum ggml_cgraph_eval_order {
+       GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+       GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+       GGML_CGRAPH_EVAL_ORDER_COUNT
+   };
 
    // computation graph
    struct ggml_cgraph {
@@ -533,6 +559,8 @@ extern "C" {
 
        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
 
+       enum ggml_cgraph_eval_order order;
+
        // performance
        int perf_runs;
        int64_t perf_cycles;
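
The new `order` field selects the direction in which the graph's nodes are evaluated. A hedged sketch of setting it before building the forward graph (ggml_new_graph and the `loss` tensor are assumptions, loosely following llama.cpp's training examples):

    struct ggml_cgraph * gf = ggml_new_graph(ctx);    // assumed graph allocator
    gf->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT; // default is LEFT_TO_RIGHT
    ggml_build_forward_expand(gf, loss);              // `loss` is a hypothetical output tensor
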
@@ -680,12 +708,21 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 
+   // Converts a flat index into coordinates
+   GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
    GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 
+   GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+   GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
    GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
    GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 
+   GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+   GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
    GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
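The new _nd accessors and ggml_unravel_index remove the need for manual nb[] pointer arithmetic when inspecting tensor data. A small self-contained sketch (CPU context; the buffer size is arbitrary):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
        ggml_set_f32_nd(t, 2, 1, 0, 0, 42.0f);        // element (i0=2, i1=1)

        int64_t i0, i1, i2, i3;
        ggml_unravel_index(t, 6, &i0, &i1, &i2, &i3); // flat index 6 -> (2, 1, 0, 0)

        printf("%.1f\n", ggml_get_f32_nd(t, (int) i0, (int) i1, (int) i2, (int) i3)); // 42.0

        ggml_free(ctx);
        return 0;
    }
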
@@ -719,6 +756,12 @@ extern "C" {
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 
+   GGML_API struct ggml_tensor * ggml_add_cast(
+       struct ggml_context * ctx,
+       struct ggml_tensor * a,
+       struct ggml_tensor * b,
+       enum ggml_type type);
+
    GGML_API struct ggml_tensor * ggml_add1(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
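
ggml_add_cast adds two tensors and creates the result with the requested type instead of inheriting a's type. A hedged fragment (assumes an initialized `ctx`; the mixed-precision use case is an assumption based on how training code typically applies updates):

    struct ggml_tensor * w  = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 4096); // weights
    struct ggml_tensor * dw = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096); // update
    // keep the sum in f32 rather than rounding back down to f16 each step
    struct ggml_tensor * w1 = ggml_add_cast(ctx, w, dw, GGML_TYPE_F32);
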
@@ -828,6 +871,7 @@ extern "C" {
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 
+   // sums repetitions in a into shape of b
    GGML_API struct ggml_tensor * ggml_repeat_back(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
@@ -1049,7 +1093,6 @@ extern "C" {
        size_t nb1,
        size_t offset);
 
-
    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
@@ -1072,6 +1115,33 @@ extern "C" {
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 
+   // make contiguous, with new shape
+   GGML_API struct ggml_tensor * ggml_cont_1d(
+       struct ggml_context * ctx,
+       struct ggml_tensor * a,
+       int64_t ne0);
+
+   GGML_API struct ggml_tensor * ggml_cont_2d(
+       struct ggml_context * ctx,
+       struct ggml_tensor * a,
+       int64_t ne0,
+       int64_t ne1);
+
+   GGML_API struct ggml_tensor * ggml_cont_3d(
+       struct ggml_context * ctx,
+       struct ggml_tensor * a,
+       int64_t ne0,
+       int64_t ne1,
+       int64_t ne2);
+
+   GGML_API struct ggml_tensor * ggml_cont_4d(
+       struct ggml_context * ctx,
+       struct ggml_tensor * a,
+       int64_t ne0,
+       int64_t ne1,
+       int64_t ne2,
+       int64_t ne3);
+
    // return view(a), b specifies the new shape
    // TODO: when we start computing gradient, make a copy instead of view
    GGML_API struct ggml_tensor * ggml_reshape(
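
The shaped ggml_cont_* variants fuse the usual cont-then-reshape pair into one node. A hedged fragment (assumes an initialized `ctx`):

    // ggml_permute returns a non-contiguous view; ggml_cont_2d copies it to
    // contiguous memory and applies the new shape in a single op, replacing
    // ggml_reshape_2d(ctx, ggml_cont(ctx, xt), 16, 8).
    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 16);
    struct ggml_tensor * xt = ggml_permute(ctx, x, 1, 0, 2, 3); // transposed view, ne = [16, 8]
    struct ggml_tensor * xc = ggml_cont_2d(ctx, xt, 16, 8);     // contiguous copy with that shape
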
@@ -1219,14 +1289,15 @@ extern "C" {
        struct ggml_tensor * b);
 
    // rotary position embedding
-   // if mode & 1 == 1, skip n_past elements
+   // if mode & 1 == 1, skip n_past elements (DEPRECATED)
    // if mode & 2 == 1, GPT-NeoX style
    // if mode & 4 == 1, ChatGLM style
-   // TODO: avoid creating a new tensor every time
+   //
+   // b is an int32 vector with size a->ne[2], it contains the positions
    GGML_API struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
-       int n_past,
+       struct ggml_tensor * b,
        int n_dims,
        int mode,
        int n_ctx);
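
The whole rope family now takes the positions as an int32 tensor b with one entry per token (size a->ne[2]) instead of a scalar n_past, so each token's position can be set independently. A hedged migration sketch; `cur`, `n_rot` and `n_ctx` are hypothetical model values:

    static struct ggml_tensor * apply_rope(struct ggml_context * ctx,
                                           struct ggml_tensor * cur, // [head_dim, n_head, n_tokens]
                                           int n_past, int n_rot, int n_ctx) {
        const int64_t n_tokens = cur->ne[2];  // b must have size a->ne[2]

        struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
        for (int64_t i = 0; i < n_tokens; ++i) {
            ggml_set_i32_1d(pos, (int) i, (int32_t)(n_past + i)); // absolute position of token i
        }

        // was: ggml_rope(ctx, cur, n_past, n_rot, 0, n_ctx);
        return ggml_rope(ctx, cur, pos, n_rot, 0, n_ctx);
    }
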
@@ -1235,7 +1306,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
-       int n_past,
+       struct ggml_tensor * b,
        int n_dims,
        int mode,
        int n_ctx);
@@ -1244,7 +1315,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_custom(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
-       int n_past,
+       struct ggml_tensor * b,
        int n_dims,
        int mode,
        int n_ctx,
@@ -1255,7 +1326,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
-       int n_past,
+       struct ggml_tensor * b,
        int n_dims,
        int mode,
        int n_ctx,
@@ -1266,7 +1337,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
-       int n_past,
+       struct ggml_tensor * b,
        int n_dims,
        float base,
        bool down);
@@ -1276,7 +1347,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_back(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
-       int n_past,
+       struct ggml_tensor * b,
        int n_dims,
        int mode,
        int n_ctx,
@@ -1319,6 +1390,14 @@ extern "C" {
        int s,
        int d);
 
+   GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+       struct ggml_context * ctx,
+       struct ggml_tensor * a,
+       struct ggml_tensor * b,
+       int s0,
+       int p0,
+       int d0);
+
    GGML_API struct ggml_tensor * ggml_conv_2d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
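
For ggml_conv_transpose_1d, a is the kernel and b the input, mirroring ggml_conv_1d, and the output length follows the standard transposed-convolution formula L_out = (L_in - 1)*s0 - 2*p0 + d0*(K - 1) + 1. A hedged fragment (tensor layouts and types here are assumptions based on the ggml_conv_1d conventions):

    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 4, 32, 64); // K=4, 32 out ch, 64 in ch
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 100, 64);   // L_in=100, 64 in ch
    struct ggml_tensor * y = ggml_conv_transpose_1d(ctx, k, x, /*s0*/ 2, /*p0*/ 0, /*d0*/ 1);
    // y->ne[0] == (100 - 1)*2 - 2*0 + 1*(4 - 1) + 1 == 202, y->ne[1] == 32
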
@@ -1656,6 +1735,16 @@ extern "C" {
    // dump the graph into a file using the dot format
    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
+   // build gradient checkpointing backward graph gb for gf using provided checkpoints
+   // gb_tmp will contain original backward graph with rewritten backward process nodes,
+   // but without the second forward pass nodes.
+   GGML_API void ggml_build_backward_gradient_checkpointing(
+       struct ggml_context * ctx,
+       struct ggml_cgraph * gf,
+       struct ggml_cgraph * gb,
+       struct ggml_cgraph * gb_tmp,
+       struct ggml_tensor * * checkpoints,
+       int n_checkpoints);
    //
    // optimization
    //
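
A hedged sketch of how this gets wired up, loosely following llama.cpp's finetune example (ggml_new_graph, the `checkpoints` array and the `loss` tensor are assumptions):

    struct ggml_cgraph * gf     = ggml_new_graph(ctx); // forward graph
    struct ggml_cgraph * gb     = ggml_new_graph(ctx); // checkpointed backward graph (output)
    struct ggml_cgraph * gb_tmp = ggml_new_graph(ctx); // scratch: original backward graph

    ggml_build_forward_expand(gf, loss);
    // checkpoints[] would typically hold the inputs plus each layer's output
    ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp,
                                               checkpoints, n_checkpoints);
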
@@ -1682,6 +1771,7 @@ extern "C" {
        GGML_OPT_NO_CONTEXT,
        GGML_OPT_INVALID_WOLFE,
        GGML_OPT_FAIL,
+       GGML_OPT_CANCEL,
 
        GGML_LINESEARCH_FAIL = -128,
        GGML_LINESEARCH_MINIMUM_STEP,
@@ -1690,7 +1780,8 @@ extern "C" {
        GGML_LINESEARCH_INVALID_PARAMETERS,
    };
 
-   typedef void (*ggml_opt_callback)(void * data, float * sched);
+   typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+   typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
    // optimization parameters
    //
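
The optimizer callback gains a gradient-accumulation step index and a cancel flag (pairing with the new GGML_OPT_CANCEL result above), and there is now a library-wide log callback using the new ggml_log_level. A hedged sketch of both; the scheduling and cancel policy is made up:

    #include "ggml.h"
    #include <stdio.h>

    // Called during optimization; setting *cancel asks the optimizer to stop.
    static void my_opt_callback(void * data, int accum_step, float * sched, bool * cancel) {
        int * n_iter = (int *) data;   // hypothetical user state
        if (accum_step == 0) {
            ++*n_iter;                 // count full iterations, not accumulation steps
        }
        *sched  = 1.0f;                // learning-rate schedule multiplier
        *cancel = (*n_iter > 1000);    // e.g. stop after 1000 iterations
    }

    // Matches the new ggml_log_callback typedef; lower level = more severe.
    static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level <= GGML_LOG_LEVEL_WARN) {
            fputs(text, stderr);       // print only errors and warnings
        }
    }
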
@@ -1721,6 +1812,8 @@ extern "C" {
        bool print_forward_graph;
        bool print_backward_graph;
 
+       int n_gradient_accumulation;
+
        // ADAM parameters
        struct {
            int n_iter;
@@ -1766,6 +1859,7 @@ extern "C" {
        float loss_after;
 
        struct {
+           struct ggml_tensor * g;  // current gradient
            struct ggml_tensor * m;  // first moment
            struct ggml_tensor * v;  // second moment
            struct ggml_tensor * pf; // past function values
@@ -1882,26 +1976,26 @@ extern "C" {
 
    GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
    GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
-   GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
-
-   GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
-   GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
-
-   // results are undefined if the wrong type is used for the key
-   GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
-   GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
-   GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
-   GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
-   GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
-   GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
-   GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
-   GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
-   GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
-   GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
-   GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
-   GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
-   GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
-   GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
+   GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
+
+   GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
+   GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
+
+   // will abort if the wrong type is used for the key
+   GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
+   GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
+   GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
+   GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
+   GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
+   GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
+   GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
+   GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
+   GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
+   GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
+   GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
+   GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+   GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
+   GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
 
    GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
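
The rename from `i` to `key_id` clarifies that these getters take the key's index (e.g. from gguf_find_key), not a tensor index, and a type mismatch now aborts instead of returning undefined results. A hedged sketch of the resulting check-then-get pattern ("general.architecture" is a standard GGUF key; the negative return for a missing key is an assumption):

    #include "ggml.h"
    #include <stdio.h>

    // Look up a string value defensively: verify the key exists and has the
    // expected type before calling the (now aborting) typed getter.
    static void print_arch(const struct gguf_context * gctx) {
        const int key_id = gguf_find_key(gctx, "general.architecture");
        if (key_id < 0) {
            fprintf(stderr, "key not found\n");
            return;
        }
        if (gguf_get_kv_type(gctx, key_id) == GGUF_TYPE_STRING) {
            printf("arch: %s\n", gguf_get_val_str(gctx, key_id));
        }
    }
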