llama_cpp 0.5.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -214,8 +214,8 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS     4
-#define GGML_MAX_NODES    4096
-#define GGML_MAX_PARAMS   256
+#define GGML_MAX_NODES    16384
+#define GGML_MAX_PARAMS   1024
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC      6
 #define GGML_MAX_NAME     64
@@ -248,6 +248,14 @@
         } \
     } while (0)
 
+#ifndef NDEBUG
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#elif defined(__GNUC__)
+#define GGML_UNREACHABLE() __builtin_unreachable()
+#else
+#define GGML_UNREACHABLE() ((void) 0)
+#endif
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
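The new GGML_UNREACHABLE() macro asserts in debug builds, becomes __builtin_unreachable() on GCC/Clang release builds, and is a no-op elsewhere. A minimal sketch of the intended use; the helper function here is hypothetical:

```c
#include "ggml.h"

// Hypothetical helper illustrating GGML_UNREACHABLE(): every case is
// covered above, so the final line documents (debug builds) or exploits
// (GNU C release builds) the fact that control never reaches it.
static int sign_of(int x) {
    if (x > 0) return  1;
    if (x < 0) return -1;
    if (x == 0) return  0;
    GGML_UNREACHABLE();
}
```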
@@ -445,6 +453,12 @@ extern "C" {
         GGML_OBJECT_WORK_BUFFER
     };
 
+    enum ggml_log_level {
+        GGML_LOG_LEVEL_ERROR = 2,
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_INFO  = 4
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -467,8 +481,8 @@ extern "C" {
         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                                   // nb[0] = sizeof(type)
-                                   // nb[1] = nb[0]   * ne[0] + padding
+                                   // nb[0] = ggml_type_size(type)
+                                   // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
                                    // nb[i] = nb[i-1] * ne[i-1]
 
         // compute data
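The corrected stride comment accounts for block-quantized types: ne[0] elements occupy ne[0] / ggml_blck_size(type) blocks of ggml_type_size(type) bytes each, so nb[1] is a per-row byte stride, not elements times element size. A worked example, assuming Q4_0's usual layout of 32 elements per 18-byte block (verify with ggml_type_size()/ggml_blck_size() on your build):

```c
#include <stdio.h>
#include "ggml.h"

// Row stride of a block-quantized row under the corrected formula.
// Assumption: GGML_TYPE_Q4_0 packs 32 elements into an 18-byte block.
int main(void) {
    const int64_t ne0 = 4096; // elements along dim 0
    const size_t  nb0 = ggml_type_size(GGML_TYPE_Q4_0);               // bytes per block (18)
    const size_t  nb1 = nb0 * (ne0 / ggml_blck_size(GGML_TYPE_Q4_0)); // 18 * 128 = 2304 bytes
    printf("nb[1] = %zu bytes, not %zu\n", nb1, (size_t) ne0 * nb0);
    return 0;
}
```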
@@ -520,7 +534,15 @@ extern "C" {
     // next prime after GGML_MAX_NODES
     // #define GGML_GRAPH_HASHTABLE_SIZE 4099
     // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+    // #define GGML_GRAPH_HASHTABLE_SIZE 8273
+    // #define GGML_GRAPH_HASHTABLE_SIZE 16411
+    #define GGML_GRAPH_HASHTABLE_SIZE 32771
+
+    enum ggml_cgraph_eval_order {
+        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+        GGML_CGRAPH_EVAL_ORDER_COUNT
+    };
 
     // computation graph
     struct ggml_cgraph {
@@ -533,6 +555,8 @@ extern "C" {
 
         void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
 
+        enum ggml_cgraph_eval_order order;
+
        // performance
        int     perf_runs;
        int64_t perf_cycles;
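The hash table grows in step with GGML_MAX_NODES (32771 is the next prime after 16384 * 2), and each graph now carries an evaluation-order field. A hedged sketch of flipping it; which order is the default is not stated in this diff:

```c
#include "ggml.h"

// Evaluation order changes when intermediate tensors die, which can
// matter for memory reuse by an allocator. Assumes `gf` was already
// built; purely illustrative.
static void eval_right_to_left(struct ggml_cgraph * gf) {
    gf->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
}
```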
@@ -680,12 +704,21 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 
+    // Converts a flat index into coordinates
+    GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
     GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
     GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 
+    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
     GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
     GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 
+    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
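The new accessors address elements by coordinates instead of a flat index, and ggml_unravel_index converts between the two. A minimal sketch:

```c
#include <stdio.h>
#include "ggml.h"

// Unravel a flat element index into (i0, i1, i2, i3) coordinates, then
// read the element back by coordinates with the new _nd getter.
int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false, // tensor data is allocated, so we can write it
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // 4 cols, 3 rows
    ggml_set_f32_1d(t, 7, 42.0f); // flat index 7 -> col 3, row 1

    int64_t i0, i1, i2, i3;
    ggml_unravel_index(t, 7, &i0, &i1, &i2, &i3);
    printf("flat 7 -> (%lld, %lld, %lld, %lld) = %f\n",
           (long long) i0, (long long) i1, (long long) i2, (long long) i3,
           ggml_get_f32_nd(t, (int) i0, (int) i1, (int) i2, (int) i3));

    ggml_free(ctx);
    return 0;
}
```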
@@ -719,6 +752,12 @@ extern "C" {
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add_cast(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            enum ggml_type type);
+
     GGML_API struct ggml_tensor * ggml_add1(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
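ggml_add_cast fuses an addition with a cast of the result to the requested type, useful when accumulation needs higher precision than the operands. A hedged sketch; the wrapper is illustrative, not part of the package:

```c
#include "ggml.h"

// Sum two f16 tensors into an f32 result in a single graph op,
// instead of ggml_add followed by a separate cast/copy.
static struct ggml_tensor * add_f16_as_f32(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,   // f16
        struct ggml_tensor  * b) { // f16
    return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
}
```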
@@ -828,6 +867,7 @@ extern "C" {
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+    // sums repetitions in a into shape of b
     GGML_API struct ggml_tensor * ggml_repeat_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -1049,7 +1089,6 @@ extern "C" {
         size_t nb1,
         size_t offset);
 
-
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
@@ -1072,6 +1111,33 @@ extern "C" {
         struct ggml_context * ctx,
         struct ggml_tensor * a);
 
+    // make contiguous, with new shape
+    GGML_API struct ggml_tensor * ggml_cont_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0);
+
+    GGML_API struct ggml_tensor * ggml_cont_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1);
+
+    GGML_API struct ggml_tensor * ggml_cont_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
+
+    GGML_API struct ggml_tensor * ggml_cont_4d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
+
     // return view(a), b specifies the new shape
     // TODO: when we start computing gradient, make a copy instead of view
     GGML_API struct ggml_tensor * ggml_reshape(
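The ggml_cont_Nd helpers materialize a non-contiguous tensor with a new shape in one op, where previously a ggml_cont plus ggml_reshape_Nd pair was needed. A hedged sketch using a transposed view:

```c
#include "ggml.h"

// A transpose in ggml is a non-contiguous view; ggml_cont_2d copies it
// into contiguous memory and gives it the swapped 2-D shape directly.
static struct ggml_tensor * transpose_to_contiguous(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) { // [ne0, ne1]
    struct ggml_tensor * at = ggml_transpose(ctx, a); // view, non-contiguous
    return ggml_cont_2d(ctx, at, a->ne[1], a->ne[0]); // [ne1, ne0], contiguous
}
```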
@@ -1219,14 +1285,15 @@ extern "C" {
         struct ggml_tensor * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements
+    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
-    // TODO: avoid creating a new tensor every time
+    //
+    // b is an int32 vector with size a->ne[2], it contains the positions
     GGML_API struct ggml_tensor * ggml_rope(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int n_past,
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx);
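This is the breaking change of the release: ggml_rope (and all the variants in the hunks that follow) now takes an int32 position tensor b, with one entry per token, instead of int n_past. The old behaviour is recovered by filling b with n_past, n_past + 1, and so on. A hedged sketch; the wrapper is illustrative and assumes a context that allocates tensor data:

```c
#include "ggml.h"

// Emulate the old `int n_past` interface on top of the new position
// tensor. Per the header comment, b must be an int32 vector with
// a->ne[2] entries (one position per token).
static struct ggml_tensor * rope_with_past(
        struct ggml_context * ctx,
        struct ggml_tensor  * a, // e.g. [head_dim, n_head, n_tokens]
        int n_past, int n_dims, int mode, int n_ctx) {
    const int64_t n_tokens = a->ne[2];
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    for (int64_t i = 0; i < n_tokens; ++i) {
        ggml_set_i32_1d(pos, (int) i, n_past + (int) i);
    }
    return ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
}
```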
@@ -1235,7 +1302,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int n_past,
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx);
@@ -1244,7 +1311,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_custom(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int n_past,
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx,
@@ -1255,7 +1322,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int n_past,
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx,
@@ -1266,7 +1333,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int n_past,
+        struct ggml_tensor * b,
         int n_dims,
         float base,
         bool down);
@@ -1276,7 +1343,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_rope_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int n_past,
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx,
@@ -1656,6 +1723,16 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
+    // build gradient checkpointing backward graph gb for gf using provided checkpoints
+    // gb_tmp will contain original backward graph with rewritten backward process nodes,
+    // but without the second forward pass nodes.
+    GGML_API void ggml_build_backward_gradient_checkpointing(
+            struct ggml_context   * ctx,
+            struct ggml_cgraph    * gf,
+            struct ggml_cgraph    * gb,
+            struct ggml_cgraph    * gb_tmp,
+            struct ggml_tensor  * * checkpoints,
+            int                     n_checkpoints);
     //
     // optimization
     //
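A hedged usage sketch of the new gradient-checkpointing builder: given a forward graph gf and caller-chosen checkpoint tensors (for example, one per transformer layer), it fills gb with a backward graph that recomputes intermediate activations from the checkpoints rather than keeping them all alive. The sketch assumes ggml_new_graph is available in this build:

```c
#include "ggml.h"

// Build a memory-saving backward graph from a forward graph `gf` and a
// set of checkpoint tensors. `gb_tmp` is scratch per the header comment.
static struct ggml_cgraph * build_checkpointed_backward(
        struct ggml_context * ctx,
        struct ggml_cgraph  * gf,
        struct ggml_tensor ** checkpoints,
        int                   n_checkpoints) {
    struct ggml_cgraph * gb     = ggml_new_graph(ctx); // backward graph to fill
    struct ggml_cgraph * gb_tmp = ggml_new_graph(ctx); // scratch backward graph
    ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp,
                                               checkpoints, n_checkpoints);
    return gb; // compute gb to obtain gradients with reduced peak memory
}
```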
@@ -1690,7 +1767,8 @@ extern "C" {
         GGML_LINESEARCH_INVALID_PARAMETERS,
     };
 
-    typedef void (*ggml_opt_callback)(void * data, float * sched);
+    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
     // optimization parameters
     //
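ggml_opt_callback gains the gradient-accumulation step and a cancellation out-parameter (matching the new n_gradient_accumulation field below), and ggml_log_callback is the consumer of the ggml_log_level enum added earlier. Hedged sketches of both:

```c
#include <stdio.h>
#include "ggml.h"

// Optimizer callback: set the schedule multiplier and optionally
// request early cancellation. `accum_step` reports which gradient
// accumulation step this call belongs to.
static void my_opt_callback(void * data, int accum_step, float * sched, bool * cancel) {
    (void) data; (void) accum_step;
    *sched  = 1.0f;  // learning-rate schedule multiplier
    *cancel = false; // set true to stop optimization early
}

// Log callback: route errors/warnings to stderr, info to stdout,
// using the new severity levels (ERROR=2, WARN=3, INFO=4).
static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    fputs(text, level <= GGML_LOG_LEVEL_WARN ? stderr : stdout);
}
```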
@@ -1721,6 +1799,8 @@ extern "C" {
         bool print_forward_graph;
         bool print_backward_graph;
 
+        int n_gradient_accumulation;
+
         // ADAM parameters
         struct {
             int n_iter;
@@ -1766,6 +1846,7 @@ extern "C" {
         float loss_after;
 
         struct {
+            struct ggml_tensor * g;  // current gradient
             struct ggml_tensor * m;  // first moment
             struct ggml_tensor * v;  // second moment
             struct ggml_tensor * pf; // past function values
@@ -1882,26 +1963,26 @@ extern "C" {
 
     GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
     GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
-
-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
-
-    // results are undefined if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int i);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int i);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int i);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int i);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int i);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int i);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int i);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int i);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int i);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int i);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int i);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
-    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int i);
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
+
+    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
+    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
+
+    // will abort if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
+    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
+    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
+    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
+    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
+    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
+    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
+    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
+    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
+    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int          gguf_get_n_tensors (const struct gguf_context * ctx);
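Beyond the i to key_id rename, the documented behaviour of the scalar getters changes from undefined results on a type mismatch to an abort, so callers should validate the type before reading. A hedged sketch; it assumes gguf_find_key reports a missing key with a negative id:

```c
#include "ggml.h"

// Read a u32 metadata value defensively: check presence and type first,
// since gguf_get_val_u32 now aborts on a type mismatch.
static uint32_t read_u32_or(const struct gguf_context * ctx, const char * key, uint32_t dflt) {
    const int key_id = gguf_find_key(ctx, key);
    if (key_id < 0 || gguf_get_kv_type(ctx, key_id) != GGUF_TYPE_UINT32) {
        return dflt; // missing or wrong type: fall back instead of aborting
    }
    return gguf_get_val_u32(ctx, key_id);
}
```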