llama_cpp 0.5.2 → 0.6.0

Sign up to get free protection for your applications and access to all of the features.
@@ -195,6 +195,14 @@
195
195
  # define GGML_DEPRECATED(func, hint) func
196
196
  #endif
197
197
 
198
+ #ifndef __GNUC__
199
+ # define GGML_ATTRIBUTE_FORMAT(...)
200
+ #elif defined(__MINGW32__)
201
+ # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
202
+ #else
203
+ # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
204
+ #endif
205
+
198
206
  #include <stdint.h>
199
207
  #include <stddef.h>
200
208
  #include <stdbool.h>
@@ -206,8 +214,8 @@
206
214
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
207
215
 
208
216
  #define GGML_MAX_DIMS 4
209
- #define GGML_MAX_NODES 4096
210
- #define GGML_MAX_PARAMS 256
217
+ #define GGML_MAX_NODES 16384
218
+ #define GGML_MAX_PARAMS 1024
211
219
  #define GGML_MAX_CONTEXTS 64
212
220
  #define GGML_MAX_SRC 6
213
221
  #define GGML_MAX_NAME 64
@@ -240,6 +248,14 @@
240
248
  } \
241
249
  } while (0)
242
250
 
251
+ #ifndef NDEBUG
252
+ #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
253
+ #elif defined(__GNUC__)
254
+ #define GGML_UNREACHABLE() __builtin_unreachable()
255
+ #else
256
+ #define GGML_UNREACHABLE() ((void) 0)
257
+ #endif
258
+
243
259
  // used to copy the number of elements and stride in bytes of tensors into local variables.
244
260
  // main purpose is to reduce code duplication and improve readability.
245
261
  //
@@ -270,7 +286,7 @@ extern "C" {
270
286
 
271
287
  #if defined(__ARM_NEON) && defined(__CUDACC__)
272
288
  typedef half ggml_fp16_t;
273
- #elif defined(__ARM_NEON) && !defined(_MSC_VER)
289
+ #elif defined(__ARM_NEON)
274
290
  typedef __fp16 ggml_fp16_t;
275
291
  #else
276
292
  typedef uint16_t ggml_fp16_t;
@@ -437,6 +453,12 @@ extern "C" {
437
453
  GGML_OBJECT_WORK_BUFFER
438
454
  };
439
455
 
456
+ enum ggml_log_level {
457
+ GGML_LOG_LEVEL_ERROR = 2,
458
+ GGML_LOG_LEVEL_WARN = 3,
459
+ GGML_LOG_LEVEL_INFO = 4
460
+ };
461
+
440
462
  // ggml object
441
463
  struct ggml_object {
442
464
  size_t offs;
@@ -459,8 +481,8 @@ extern "C" {
459
481
  int n_dims;
460
482
  int64_t ne[GGML_MAX_DIMS]; // number of elements
461
483
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
462
- // nb[0] = sizeof(type)
463
- // nb[1] = nb[0] * ne[0] + padding
484
+ // nb[0] = ggml_type_size(type)
485
+ // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
464
486
  // nb[i] = nb[i-1] * ne[i-1]
465
487
 
466
488
  // compute data
@@ -512,7 +534,15 @@ extern "C" {
512
534
  // next prime after GGML_MAX_NODES
513
535
  // #define GGML_GRAPH_HASHTABLE_SIZE 4099
514
536
  // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
515
- #define GGML_GRAPH_HASHTABLE_SIZE 8273
537
+ // #define GGML_GRAPH_HASHTABLE_SIZE 8273
538
+ // #define GGML_GRAPH_HASHTABLE_SIZE 16411
539
+ #define GGML_GRAPH_HASHTABLE_SIZE 32771
540
+
541
+ enum ggml_cgraph_eval_order {
542
+ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
543
+ GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
544
+ GGML_CGRAPH_EVAL_ORDER_COUNT
545
+ };
516
546
 
517
547
  // computation graph
518
548
  struct ggml_cgraph {
@@ -525,6 +555,8 @@ extern "C" {
525
555
 
526
556
  void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
527
557
 
558
+ enum ggml_cgraph_eval_order order;
559
+
528
560
  // performance
529
561
  int perf_runs;
530
562
  int64_t perf_cycles;
@@ -672,12 +704,21 @@ extern "C" {
672
704
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
673
705
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
674
706
 
707
+ // Converts a flat index into coordinates
708
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
709
+
675
710
  GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
676
711
  GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
677
712
 
713
+ GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
714
+ GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
715
+
678
716
  GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
679
717
  GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
680
718
 
719
+ GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
720
+ GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
721
+
681
722
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
682
723
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
683
724
 
@@ -685,6 +726,7 @@ extern "C" {
685
726
 
686
727
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
687
728
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
729
+ GGML_ATTRIBUTE_FORMAT(2, 3)
688
730
  GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
689
731
 
690
732
  //
@@ -710,6 +752,12 @@ extern "C" {
710
752
  struct ggml_tensor * a,
711
753
  struct ggml_tensor * b);
712
754
 
755
+ GGML_API struct ggml_tensor * ggml_add_cast(
756
+ struct ggml_context * ctx,
757
+ struct ggml_tensor * a,
758
+ struct ggml_tensor * b,
759
+ enum ggml_type type);
760
+
713
761
  GGML_API struct ggml_tensor * ggml_add1(
714
762
  struct ggml_context * ctx,
715
763
  struct ggml_tensor * a,
@@ -819,6 +867,7 @@ extern "C" {
819
867
  struct ggml_tensor * a,
820
868
  struct ggml_tensor * b);
821
869
 
870
+ // sums repetitions in a into the shape of b
822
871
  GGML_API struct ggml_tensor * ggml_repeat_back(
823
872
  struct ggml_context * ctx,
824
873
  struct ggml_tensor * a,
@@ -1040,7 +1089,6 @@ extern "C" {
1040
1089
  size_t nb1,
1041
1090
  size_t offset);
1042
1091
 
1043
-
1044
1092
  // a -> b, return view(b)
1045
1093
  GGML_API struct ggml_tensor * ggml_cpy(
1046
1094
  struct ggml_context * ctx,
@@ -1063,6 +1111,33 @@ extern "C" {
1063
1111
  struct ggml_context * ctx,
1064
1112
  struct ggml_tensor * a);
1065
1113
 
1114
+ // make contiguous, with new shape
1115
+ GGML_API struct ggml_tensor * ggml_cont_1d(
1116
+ struct ggml_context * ctx,
1117
+ struct ggml_tensor * a,
1118
+ int64_t ne0);
1119
+
1120
+ GGML_API struct ggml_tensor * ggml_cont_2d(
1121
+ struct ggml_context * ctx,
1122
+ struct ggml_tensor * a,
1123
+ int64_t ne0,
1124
+ int64_t ne1);
1125
+
1126
+ GGML_API struct ggml_tensor * ggml_cont_3d(
1127
+ struct ggml_context * ctx,
1128
+ struct ggml_tensor * a,
1129
+ int64_t ne0,
1130
+ int64_t ne1,
1131
+ int64_t ne2);
1132
+
1133
+ GGML_API struct ggml_tensor * ggml_cont_4d(
1134
+ struct ggml_context * ctx,
1135
+ struct ggml_tensor * a,
1136
+ int64_t ne0,
1137
+ int64_t ne1,
1138
+ int64_t ne2,
1139
+ int64_t ne3);
1140
+
1066
1141
  // return view(a), b specifies the new shape
1067
1142
  // TODO: when we start computing gradient, make a copy instead of view
1068
1143
  GGML_API struct ggml_tensor * ggml_reshape(
@@ -1210,14 +1285,15 @@ extern "C" {
1210
1285
  struct ggml_tensor * b);
1211
1286
 
1212
1287
  // rotary position embedding
1213
- // if mode & 1 == 1, skip n_past elements
1288
+ // if mode & 1 == 1, skip n_past elements (DEPRECATED)
1214
1289
  // if mode & 2 == 1, GPT-NeoX style
1215
1290
  // if mode & 4 == 1, ChatGLM style
1216
- // TODO: avoid creating a new tensor every time
1291
+ //
1292
+ // b is an int32 vector with size a->ne[2]; it contains the positions
1217
1293
  GGML_API struct ggml_tensor * ggml_rope(
1218
1294
  struct ggml_context * ctx,
1219
1295
  struct ggml_tensor * a,
1220
- int n_past,
1296
+ struct ggml_tensor * b,
1221
1297
  int n_dims,
1222
1298
  int mode,
1223
1299
  int n_ctx);
@@ -1226,7 +1302,7 @@ extern "C" {
1226
1302
  GGML_API struct ggml_tensor * ggml_rope_inplace(
1227
1303
  struct ggml_context * ctx,
1228
1304
  struct ggml_tensor * a,
1229
- int n_past,
1305
+ struct ggml_tensor * b,
1230
1306
  int n_dims,
1231
1307
  int mode,
1232
1308
  int n_ctx);
@@ -1235,7 +1311,7 @@ extern "C" {
1235
1311
  GGML_API struct ggml_tensor * ggml_rope_custom(
1236
1312
  struct ggml_context * ctx,
1237
1313
  struct ggml_tensor * a,
1238
- int n_past,
1314
+ struct ggml_tensor * b,
1239
1315
  int n_dims,
1240
1316
  int mode,
1241
1317
  int n_ctx,
@@ -1246,7 +1322,7 @@ extern "C" {
1246
1322
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1247
1323
  struct ggml_context * ctx,
1248
1324
  struct ggml_tensor * a,
1249
- int n_past,
1325
+ struct ggml_tensor * b,
1250
1326
  int n_dims,
1251
1327
  int mode,
1252
1328
  int n_ctx,
@@ -1257,7 +1333,7 @@ extern "C" {
1257
1333
  GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
1258
1334
  struct ggml_context * ctx,
1259
1335
  struct ggml_tensor * a,
1260
- int n_past,
1336
+ struct ggml_tensor * b,
1261
1337
  int n_dims,
1262
1338
  float base,
1263
1339
  bool down);
@@ -1267,7 +1343,7 @@ extern "C" {
1267
1343
  GGML_API struct ggml_tensor * ggml_rope_back(
1268
1344
  struct ggml_context * ctx,
1269
1345
  struct ggml_tensor * a,
1270
- int n_past,
1346
+ struct ggml_tensor * b,
1271
1347
  int n_dims,
1272
1348
  int mode,
1273
1349
  int n_ctx,
@@ -1647,6 +1723,16 @@ extern "C" {
1647
1723
  // dump the graph into a file using the dot format
1648
1724
  GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
1649
1725
 
1726
+ // build gradient checkpointing backward graph gb for gf using provided checkpoints
1727
+ // gb_tmp will contain original backward graph with rewritten backward process nodes,
1728
+ // but without the second forward pass nodes.
1729
+ GGML_API void ggml_build_backward_gradient_checkpointing(
1730
+ struct ggml_context * ctx,
1731
+ struct ggml_cgraph * gf,
1732
+ struct ggml_cgraph * gb,
1733
+ struct ggml_cgraph * gb_tmp,
1734
+ struct ggml_tensor * * checkpoints,
1735
+ int n_checkpoints);
1650
1736
  //
1651
1737
  // optimization
1652
1738
  //
@@ -1681,7 +1767,8 @@ extern "C" {
1681
1767
  GGML_LINESEARCH_INVALID_PARAMETERS,
1682
1768
  };
1683
1769
 
1684
- typedef void (*ggml_opt_callback)(void * data, float * sched);
1770
+ typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
1771
+ typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
1685
1772
 
1686
1773
  // optimization parameters
1687
1774
  //
@@ -1712,6 +1799,8 @@ extern "C" {
1712
1799
  bool print_forward_graph;
1713
1800
  bool print_backward_graph;
1714
1801
 
1802
+ int n_gradient_accumulation;
1803
+
1715
1804
  // ADAM parameters
1716
1805
  struct {
1717
1806
  int n_iter;
@@ -1757,6 +1846,7 @@ extern "C" {
1757
1846
  float loss_after;
1758
1847
 
1759
1848
  struct {
1849
+ struct ggml_tensor * g; // current gradient
1760
1850
  struct ggml_tensor * m; // first moment
1761
1851
  struct ggml_tensor * v; // second moment
1762
1852
  struct ggml_tensor * pf; // past function values
@@ -1866,39 +1956,39 @@ extern "C" {
1866
1956
 
1867
1957
  GGML_API const char * gguf_type_name(enum gguf_type type);
1868
1958
 
1869
- GGML_API int gguf_get_version (struct gguf_context * ctx);
1870
- GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
1871
- GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
1872
- GGML_API void * gguf_get_data (struct gguf_context * ctx);
1873
-
1874
- GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
1875
- GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
1876
- GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
1877
-
1878
- GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
1879
- GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
1880
-
1881
- // results are undefined if the wrong type is used for the key
1882
- GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
1883
- GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
1884
- GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
1885
- GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
1886
- GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
1887
- GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
1888
- GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
1889
- GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
1890
- GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
1891
- GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
1892
- GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
1893
- GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
1894
- GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
1895
- GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
1896
- GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
1897
-
1898
- GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
1899
- GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
1900
- GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
1901
- GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
1959
+ GGML_API int gguf_get_version (const struct gguf_context * ctx);
1960
+ GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
1961
+ GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
1962
+ GGML_API void * gguf_get_data (const struct gguf_context * ctx);
1963
+
1964
+ GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
1965
+ GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
1966
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
1967
+
1968
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
1969
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
1970
+
1971
+ // will abort if the wrong type is used for the key
1972
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
1973
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
1974
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
1975
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
1976
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
1977
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
1978
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
1979
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
1980
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
1981
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
1982
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
1983
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
1984
+ GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
1985
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
1986
+ GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
1987
+
1988
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
1989
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
1990
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
1991
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
1902
1992
 
1903
1993
  // overrides existing values or adds a new one
1904
1994
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
@@ -1943,11 +2033,11 @@ extern "C" {
1943
2033
  //
1944
2034
 
1945
2035
  // write the entire context to a binary file
1946
- GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
2036
+ GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
1947
2037
 
1948
2038
  // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
1949
- GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
1950
- GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
2039
+ GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
2040
+ GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
1951
2041
 
1952
2042
  //
1953
2043
  // system info
@@ -1961,6 +2051,7 @@ extern "C" {
1961
2051
  GGML_API int ggml_cpu_has_fma (void);
1962
2052
  GGML_API int ggml_cpu_has_neon (void);
1963
2053
  GGML_API int ggml_cpu_has_arm_fma (void);
2054
+ GGML_API int ggml_cpu_has_metal (void);
1964
2055
  GGML_API int ggml_cpu_has_f16c (void);
1965
2056
  GGML_API int ggml_cpu_has_fp16_va (void);
1966
2057
  GGML_API int ggml_cpu_has_wasm_simd (void);