llama_cpp 0.5.3 → 0.6.0

Diff of ggml.h between llama_cpp 0.5.3 and 0.6.0 (changed lines prefixed with - and +; paired numbers are the old/new line positions from the diff viewer).
@@ -214,8 +214,8 @@
214
214
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
215
215
 
216
216
  #define GGML_MAX_DIMS 4
217
- #define GGML_MAX_NODES 4096
218
- #define GGML_MAX_PARAMS 256
217
+ #define GGML_MAX_NODES 16384
218
+ #define GGML_MAX_PARAMS 1024
219
219
  #define GGML_MAX_CONTEXTS 64
220
220
  #define GGML_MAX_SRC 6
221
221
  #define GGML_MAX_NAME 64
@@ -248,6 +248,14 @@
248
248
  } \
249
249
  } while (0)
250
250
 
251
+ #ifndef NDEBUG
252
+ #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
253
+ #elif defined(__GNUC__)
254
+ #define GGML_UNREACHABLE() __builtin_unreachable()
255
+ #else
256
+ #define GGML_UNREACHABLE() ((void) 0)
257
+ #endif
258
+
251
259
  // used to copy the number of elements and stride in bytes of tensors into local variables.
252
260
  // main purpose is to reduce code duplication and improve readability.
253
261
  //
@@ -445,6 +453,12 @@ extern "C" {
445
453
  GGML_OBJECT_WORK_BUFFER
446
454
  };
447
455
 
456
+ enum ggml_log_level {
457
+ GGML_LOG_LEVEL_ERROR = 2,
458
+ GGML_LOG_LEVEL_WARN = 3,
459
+ GGML_LOG_LEVEL_INFO = 4
460
+ };
461
+
448
462
  // ggml object
449
463
  struct ggml_object {
450
464
  size_t offs;
@@ -467,8 +481,8 @@ extern "C" {
467
481
  int n_dims;
468
482
  int64_t ne[GGML_MAX_DIMS]; // number of elements
469
483
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
470
- // nb[0] = sizeof(type)
471
- // nb[1] = nb[0] * ne[0] + padding
484
+ // nb[0] = ggml_type_size(type)
485
+ // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
472
486
  // nb[i] = nb[i-1] * ne[i-1]
473
487
 
474
488
  // compute data
@@ -520,7 +534,15 @@ extern "C" {
520
534
  // next prime after GGML_MAX_NODES
521
535
  // #define GGML_GRAPH_HASHTABLE_SIZE 4099
522
536
  // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
523
- #define GGML_GRAPH_HASHTABLE_SIZE 8273
537
+ // #define GGML_GRAPH_HASHTABLE_SIZE 8273
538
+ // #define GGML_GRAPH_HASHTABLE_SIZE 16411
539
+ #define GGML_GRAPH_HASHTABLE_SIZE 32771
540
+
541
+ enum ggml_cgraph_eval_order {
542
+ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
543
+ GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
544
+ GGML_CGRAPH_EVAL_ORDER_COUNT
545
+ };
524
546
 
525
547
  // computation graph
526
548
  struct ggml_cgraph {
@@ -533,6 +555,8 @@ extern "C" {
533
555
 
534
556
  void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
535
557
 
558
+ enum ggml_cgraph_eval_order order;
559
+
536
560
  // performance
537
561
  int perf_runs;
538
562
  int64_t perf_cycles;
@@ -680,12 +704,21 @@ extern "C" {
680
704
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
681
705
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
682
706
 
707
+ // Converts a flat index into coordinates
708
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
709
+
683
710
  GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
684
711
  GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
685
712
 
713
+ GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
714
+ GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
715
+
686
716
  GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
687
717
  GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
688
718
 
719
+ GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
720
+ GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
721
+
689
722
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
690
723
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
691
724
 
@@ -719,6 +752,12 @@ extern "C" {
719
752
  struct ggml_tensor * a,
720
753
  struct ggml_tensor * b);
721
754
 
755
+ GGML_API struct ggml_tensor * ggml_add_cast(
756
+ struct ggml_context * ctx,
757
+ struct ggml_tensor * a,
758
+ struct ggml_tensor * b,
759
+ enum ggml_type type);
760
+
722
761
  GGML_API struct ggml_tensor * ggml_add1(
723
762
  struct ggml_context * ctx,
724
763
  struct ggml_tensor * a,
@@ -828,6 +867,7 @@ extern "C" {
828
867
  struct ggml_tensor * a,
829
868
  struct ggml_tensor * b);
830
869
 
870
+ // sums repetitions in a into shape of b
831
871
  GGML_API struct ggml_tensor * ggml_repeat_back(
832
872
  struct ggml_context * ctx,
833
873
  struct ggml_tensor * a,
@@ -1049,7 +1089,6 @@ extern "C" {
1049
1089
  size_t nb1,
1050
1090
  size_t offset);
1051
1091
 
1052
-
1053
1092
  // a -> b, return view(b)
1054
1093
  GGML_API struct ggml_tensor * ggml_cpy(
1055
1094
  struct ggml_context * ctx,
@@ -1072,6 +1111,33 @@ extern "C" {
1072
1111
  struct ggml_context * ctx,
1073
1112
  struct ggml_tensor * a);
1074
1113
 
1114
+ // make contiguous, with new shape
1115
+ GGML_API struct ggml_tensor * ggml_cont_1d(
1116
+ struct ggml_context * ctx,
1117
+ struct ggml_tensor * a,
1118
+ int64_t ne0);
1119
+
1120
+ GGML_API struct ggml_tensor * ggml_cont_2d(
1121
+ struct ggml_context * ctx,
1122
+ struct ggml_tensor * a,
1123
+ int64_t ne0,
1124
+ int64_t ne1);
1125
+
1126
+ GGML_API struct ggml_tensor * ggml_cont_3d(
1127
+ struct ggml_context * ctx,
1128
+ struct ggml_tensor * a,
1129
+ int64_t ne0,
1130
+ int64_t ne1,
1131
+ int64_t ne2);
1132
+
1133
+ GGML_API struct ggml_tensor * ggml_cont_4d(
1134
+ struct ggml_context * ctx,
1135
+ struct ggml_tensor * a,
1136
+ int64_t ne0,
1137
+ int64_t ne1,
1138
+ int64_t ne2,
1139
+ int64_t ne3);
1140
+
1075
1141
  // return view(a), b specifies the new shape
1076
1142
  // TODO: when we start computing gradient, make a copy instead of view
1077
1143
  GGML_API struct ggml_tensor * ggml_reshape(
@@ -1219,14 +1285,15 @@ extern "C" {
1219
1285
  struct ggml_tensor * b);
1220
1286
 
1221
1287
  // rotary position embedding
1222
- // if mode & 1 == 1, skip n_past elements
1288
+ // if mode & 1 == 1, skip n_past elements (DEPRECATED)
1223
1289
  // if mode & 2 == 1, GPT-NeoX style
1224
1290
  // if mode & 4 == 1, ChatGLM style
1225
- // TODO: avoid creating a new tensor every time
1291
+ //
1292
+ // b is an int32 vector with size a->ne[2], it contains the positions
1226
1293
  GGML_API struct ggml_tensor * ggml_rope(
1227
1294
  struct ggml_context * ctx,
1228
1295
  struct ggml_tensor * a,
1229
- int n_past,
1296
+ struct ggml_tensor * b,
1230
1297
  int n_dims,
1231
1298
  int mode,
1232
1299
  int n_ctx);
@@ -1235,7 +1302,7 @@ extern "C" {
1235
1302
  GGML_API struct ggml_tensor * ggml_rope_inplace(
1236
1303
  struct ggml_context * ctx,
1237
1304
  struct ggml_tensor * a,
1238
- int n_past,
1305
+ struct ggml_tensor * b,
1239
1306
  int n_dims,
1240
1307
  int mode,
1241
1308
  int n_ctx);
@@ -1244,7 +1311,7 @@ extern "C" {
1244
1311
  GGML_API struct ggml_tensor * ggml_rope_custom(
1245
1312
  struct ggml_context * ctx,
1246
1313
  struct ggml_tensor * a,
1247
- int n_past,
1314
+ struct ggml_tensor * b,
1248
1315
  int n_dims,
1249
1316
  int mode,
1250
1317
  int n_ctx,
@@ -1255,7 +1322,7 @@ extern "C" {
1255
1322
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1256
1323
  struct ggml_context * ctx,
1257
1324
  struct ggml_tensor * a,
1258
- int n_past,
1325
+ struct ggml_tensor * b,
1259
1326
  int n_dims,
1260
1327
  int mode,
1261
1328
  int n_ctx,
@@ -1266,7 +1333,7 @@ extern "C" {
1266
1333
  GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
1267
1334
  struct ggml_context * ctx,
1268
1335
  struct ggml_tensor * a,
1269
- int n_past,
1336
+ struct ggml_tensor * b,
1270
1337
  int n_dims,
1271
1338
  float base,
1272
1339
  bool down);
@@ -1276,7 +1343,7 @@ extern "C" {
1276
1343
  GGML_API struct ggml_tensor * ggml_rope_back(
1277
1344
  struct ggml_context * ctx,
1278
1345
  struct ggml_tensor * a,
1279
- int n_past,
1346
+ struct ggml_tensor * b,
1280
1347
  int n_dims,
1281
1348
  int mode,
1282
1349
  int n_ctx,
@@ -1656,6 +1723,16 @@ extern "C" {
1656
1723
  // dump the graph into a file using the dot format
1657
1724
  GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
1658
1725
 
1726
+ // build gradient checkpointing backward graph gb for gf using provided checkpoints
1727
+ // gb_tmp will contain original backward graph with rewritten backward process nodes,
1728
+ // but without the second forward pass nodes.
1729
+ GGML_API void ggml_build_backward_gradient_checkpointing(
1730
+ struct ggml_context * ctx,
1731
+ struct ggml_cgraph * gf,
1732
+ struct ggml_cgraph * gb,
1733
+ struct ggml_cgraph * gb_tmp,
1734
+ struct ggml_tensor * * checkpoints,
1735
+ int n_checkpoints);
1659
1736
  //
1660
1737
  // optimization
1661
1738
  //
@@ -1690,7 +1767,8 @@ extern "C" {
1690
1767
  GGML_LINESEARCH_INVALID_PARAMETERS,
1691
1768
  };
1692
1769
 
1693
- typedef void (*ggml_opt_callback)(void * data, float * sched);
1770
+ typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
1771
+ typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
1694
1772
 
1695
1773
  // optimization parameters
1696
1774
  //
@@ -1721,6 +1799,8 @@ extern "C" {
1721
1799
  bool print_forward_graph;
1722
1800
  bool print_backward_graph;
1723
1801
 
1802
+ int n_gradient_accumulation;
1803
+
1724
1804
  // ADAM parameters
1725
1805
  struct {
1726
1806
  int n_iter;
@@ -1766,6 +1846,7 @@ extern "C" {
1766
1846
  float loss_after;
1767
1847
 
1768
1848
  struct {
1849
+ struct ggml_tensor * g; // current gradient
1769
1850
  struct ggml_tensor * m; // first moment
1770
1851
  struct ggml_tensor * v; // second moment
1771
1852
  struct ggml_tensor * pf; // past function values
@@ -1882,26 +1963,26 @@ extern "C" {
1882
1963
 
1883
1964
  GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
1884
1965
  GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
1885
- GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
1886
-
1887
- GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
1888
- GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
1889
-
1890
- // results are undefined if the wrong type is used for the key
1891
- GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
1892
- GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
1893
- GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
1894
- GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
1895
- GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
1896
- GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
1897
- GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
1898
- GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
1899
- GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
1900
- GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
1901
- GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
1902
- GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
1903
- GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
1904
- GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
1966
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
1967
+
1968
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
1969
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
1970
+
1971
+ // will abort if the wrong type is used for the key
1972
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
1973
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
1974
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
1975
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
1976
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
1977
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
1978
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
1979
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
1980
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
1981
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
1982
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
1983
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
1984
+ GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
1985
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
1905
1986
  GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
1906
1987
 
1907
1988
  GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);