llama_cpp 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only; it reflects the changes between package versions exactly as they appear in their respective public registries.
@@ -195,6 +195,14 @@
195
195
  # define GGML_DEPRECATED(func, hint) func
196
196
  #endif
197
197
 
198
+ #ifndef __GNUC__
199
+ # define GGML_ATTRIBUTE_FORMAT(...)
200
+ #elif defined(__MINGW32__)
201
+ # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
202
+ #else
203
+ # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
204
+ #endif
205
+
198
206
  #include <stdint.h>
199
207
  #include <stddef.h>
200
208
  #include <stdbool.h>
@@ -206,8 +214,8 @@
206
214
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
207
215
 
208
216
  #define GGML_MAX_DIMS 4
209
- #define GGML_MAX_NODES 4096
210
- #define GGML_MAX_PARAMS 256
217
+ #define GGML_MAX_NODES 16384
218
+ #define GGML_MAX_PARAMS 1024
211
219
  #define GGML_MAX_CONTEXTS 64
212
220
  #define GGML_MAX_SRC 6
213
221
  #define GGML_MAX_NAME 64
@@ -240,6 +248,14 @@
240
248
  } \
241
249
  } while (0)
242
250
 
251
+ #ifndef NDEBUG
252
+ #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
253
+ #elif defined(__GNUC__)
254
+ #define GGML_UNREACHABLE() __builtin_unreachable()
255
+ #else
256
+ #define GGML_UNREACHABLE() ((void) 0)
257
+ #endif
258
+
243
259
  // used to copy the number of elements and stride in bytes of tensors into local variables.
244
260
  // main purpose is to reduce code duplication and improve readability.
245
261
  //
@@ -270,7 +286,7 @@ extern "C" {
270
286
 
271
287
  #if defined(__ARM_NEON) && defined(__CUDACC__)
272
288
  typedef half ggml_fp16_t;
273
- #elif defined(__ARM_NEON) && !defined(_MSC_VER)
289
+ #elif defined(__ARM_NEON)
274
290
  typedef __fp16 ggml_fp16_t;
275
291
  #else
276
292
  typedef uint16_t ggml_fp16_t;
@@ -437,6 +453,12 @@ extern "C" {
437
453
  GGML_OBJECT_WORK_BUFFER
438
454
  };
439
455
 
456
+ enum ggml_log_level {
457
+ GGML_LOG_LEVEL_ERROR = 2,
458
+ GGML_LOG_LEVEL_WARN = 3,
459
+ GGML_LOG_LEVEL_INFO = 4
460
+ };
461
+
440
462
  // ggml object
441
463
  struct ggml_object {
442
464
  size_t offs;
@@ -459,8 +481,8 @@ extern "C" {
459
481
  int n_dims;
460
482
  int64_t ne[GGML_MAX_DIMS]; // number of elements
461
483
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
462
- // nb[0] = sizeof(type)
463
- // nb[1] = nb[0] * ne[0] + padding
484
+ // nb[0] = ggml_type_size(type)
485
+ // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
464
486
  // nb[i] = nb[i-1] * ne[i-1]
465
487
 
466
488
  // compute data
@@ -512,7 +534,15 @@ extern "C" {
512
534
  // next prime after GGML_MAX_NODES
513
535
  // #define GGML_GRAPH_HASHTABLE_SIZE 4099
514
536
  // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
515
- #define GGML_GRAPH_HASHTABLE_SIZE 8273
537
+ // #define GGML_GRAPH_HASHTABLE_SIZE 8273
538
+ // #define GGML_GRAPH_HASHTABLE_SIZE 16411
539
+ #define GGML_GRAPH_HASHTABLE_SIZE 32771
540
+
541
+ enum ggml_cgraph_eval_order {
542
+ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
543
+ GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
544
+ GGML_CGRAPH_EVAL_ORDER_COUNT
545
+ };
516
546
 
517
547
  // computation graph
518
548
  struct ggml_cgraph {
@@ -525,6 +555,8 @@ extern "C" {
525
555
 
526
556
  void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
527
557
 
558
+ enum ggml_cgraph_eval_order order;
559
+
528
560
  // performance
529
561
  int perf_runs;
530
562
  int64_t perf_cycles;
@@ -672,12 +704,21 @@ extern "C" {
672
704
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
673
705
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
674
706
 
707
+ // Converts a flat index into coordinates
708
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
709
+
675
710
  GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
676
711
  GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
677
712
 
713
+ GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
714
+ GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
715
+
678
716
  GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
679
717
  GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
680
718
 
719
+ GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
720
+ GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
721
+
681
722
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
682
723
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
683
724
 
@@ -685,6 +726,7 @@ extern "C" {
685
726
 
686
727
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
687
728
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
729
+ GGML_ATTRIBUTE_FORMAT(2, 3)
688
730
  GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
689
731
 
690
732
  //
@@ -710,6 +752,12 @@ extern "C" {
710
752
  struct ggml_tensor * a,
711
753
  struct ggml_tensor * b);
712
754
 
755
+ GGML_API struct ggml_tensor * ggml_add_cast(
756
+ struct ggml_context * ctx,
757
+ struct ggml_tensor * a,
758
+ struct ggml_tensor * b,
759
+ enum ggml_type type);
760
+
713
761
  GGML_API struct ggml_tensor * ggml_add1(
714
762
  struct ggml_context * ctx,
715
763
  struct ggml_tensor * a,
@@ -819,6 +867,7 @@ extern "C" {
819
867
  struct ggml_tensor * a,
820
868
  struct ggml_tensor * b);
821
869
 
870
+ // sums repetitions in a into shape of b
822
871
  GGML_API struct ggml_tensor * ggml_repeat_back(
823
872
  struct ggml_context * ctx,
824
873
  struct ggml_tensor * a,
@@ -1040,7 +1089,6 @@ extern "C" {
1040
1089
  size_t nb1,
1041
1090
  size_t offset);
1042
1091
 
1043
-
1044
1092
  // a -> b, return view(b)
1045
1093
  GGML_API struct ggml_tensor * ggml_cpy(
1046
1094
  struct ggml_context * ctx,
@@ -1063,6 +1111,33 @@ extern "C" {
1063
1111
  struct ggml_context * ctx,
1064
1112
  struct ggml_tensor * a);
1065
1113
 
1114
+ // make contiguous, with new shape
1115
+ GGML_API struct ggml_tensor * ggml_cont_1d(
1116
+ struct ggml_context * ctx,
1117
+ struct ggml_tensor * a,
1118
+ int64_t ne0);
1119
+
1120
+ GGML_API struct ggml_tensor * ggml_cont_2d(
1121
+ struct ggml_context * ctx,
1122
+ struct ggml_tensor * a,
1123
+ int64_t ne0,
1124
+ int64_t ne1);
1125
+
1126
+ GGML_API struct ggml_tensor * ggml_cont_3d(
1127
+ struct ggml_context * ctx,
1128
+ struct ggml_tensor * a,
1129
+ int64_t ne0,
1130
+ int64_t ne1,
1131
+ int64_t ne2);
1132
+
1133
+ GGML_API struct ggml_tensor * ggml_cont_4d(
1134
+ struct ggml_context * ctx,
1135
+ struct ggml_tensor * a,
1136
+ int64_t ne0,
1137
+ int64_t ne1,
1138
+ int64_t ne2,
1139
+ int64_t ne3);
1140
+
1066
1141
  // return view(a), b specifies the new shape
1067
1142
  // TODO: when we start computing gradient, make a copy instead of view
1068
1143
  GGML_API struct ggml_tensor * ggml_reshape(
@@ -1210,14 +1285,15 @@ extern "C" {
1210
1285
  struct ggml_tensor * b);
1211
1286
 
1212
1287
  // rotary position embedding
1213
- // if mode & 1 == 1, skip n_past elements
1288
+ // if mode & 1 == 1, skip n_past elements (DEPRECATED)
1214
1289
  // if mode & 2 == 1, GPT-NeoX style
1215
1290
  // if mode & 4 == 1, ChatGLM style
1216
- // TODO: avoid creating a new tensor every time
1291
+ //
1292
+ // b is an int32 vector with size a->ne[2], it contains the positions
1217
1293
  GGML_API struct ggml_tensor * ggml_rope(
1218
1294
  struct ggml_context * ctx,
1219
1295
  struct ggml_tensor * a,
1220
- int n_past,
1296
+ struct ggml_tensor * b,
1221
1297
  int n_dims,
1222
1298
  int mode,
1223
1299
  int n_ctx);
@@ -1226,7 +1302,7 @@ extern "C" {
1226
1302
  GGML_API struct ggml_tensor * ggml_rope_inplace(
1227
1303
  struct ggml_context * ctx,
1228
1304
  struct ggml_tensor * a,
1229
- int n_past,
1305
+ struct ggml_tensor * b,
1230
1306
  int n_dims,
1231
1307
  int mode,
1232
1308
  int n_ctx);
@@ -1235,7 +1311,7 @@ extern "C" {
1235
1311
  GGML_API struct ggml_tensor * ggml_rope_custom(
1236
1312
  struct ggml_context * ctx,
1237
1313
  struct ggml_tensor * a,
1238
- int n_past,
1314
+ struct ggml_tensor * b,
1239
1315
  int n_dims,
1240
1316
  int mode,
1241
1317
  int n_ctx,
@@ -1246,7 +1322,7 @@ extern "C" {
1246
1322
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1247
1323
  struct ggml_context * ctx,
1248
1324
  struct ggml_tensor * a,
1249
- int n_past,
1325
+ struct ggml_tensor * b,
1250
1326
  int n_dims,
1251
1327
  int mode,
1252
1328
  int n_ctx,
@@ -1257,7 +1333,7 @@ extern "C" {
1257
1333
  GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
1258
1334
  struct ggml_context * ctx,
1259
1335
  struct ggml_tensor * a,
1260
- int n_past,
1336
+ struct ggml_tensor * b,
1261
1337
  int n_dims,
1262
1338
  float base,
1263
1339
  bool down);
@@ -1267,7 +1343,7 @@ extern "C" {
1267
1343
  GGML_API struct ggml_tensor * ggml_rope_back(
1268
1344
  struct ggml_context * ctx,
1269
1345
  struct ggml_tensor * a,
1270
- int n_past,
1346
+ struct ggml_tensor * b,
1271
1347
  int n_dims,
1272
1348
  int mode,
1273
1349
  int n_ctx,
@@ -1647,6 +1723,16 @@ extern "C" {
1647
1723
  // dump the graph into a file using the dot format
1648
1724
  GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
1649
1725
 
1726
+ // build gradient checkpointing backward graph gb for gf using provided checkpoints
1727
+ // gb_tmp will contain original backward graph with rewritten backward process nodes,
1728
+ // but without the second forward pass nodes.
1729
+ GGML_API void ggml_build_backward_gradient_checkpointing(
1730
+ struct ggml_context * ctx,
1731
+ struct ggml_cgraph * gf,
1732
+ struct ggml_cgraph * gb,
1733
+ struct ggml_cgraph * gb_tmp,
1734
+ struct ggml_tensor * * checkpoints,
1735
+ int n_checkpoints);
1650
1736
  //
1651
1737
  // optimization
1652
1738
  //
@@ -1681,7 +1767,8 @@ extern "C" {
1681
1767
  GGML_LINESEARCH_INVALID_PARAMETERS,
1682
1768
  };
1683
1769
 
1684
- typedef void (*ggml_opt_callback)(void * data, float * sched);
1770
+ typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
1771
+ typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
1685
1772
 
1686
1773
  // optimization parameters
1687
1774
  //
@@ -1712,6 +1799,8 @@ extern "C" {
1712
1799
  bool print_forward_graph;
1713
1800
  bool print_backward_graph;
1714
1801
 
1802
+ int n_gradient_accumulation;
1803
+
1715
1804
  // ADAM parameters
1716
1805
  struct {
1717
1806
  int n_iter;
@@ -1757,6 +1846,7 @@ extern "C" {
1757
1846
  float loss_after;
1758
1847
 
1759
1848
  struct {
1849
+ struct ggml_tensor * g; // current gradient
1760
1850
  struct ggml_tensor * m; // first moment
1761
1851
  struct ggml_tensor * v; // second moment
1762
1852
  struct ggml_tensor * pf; // past function values
@@ -1866,39 +1956,39 @@ extern "C" {
1866
1956
 
1867
1957
  GGML_API const char * gguf_type_name(enum gguf_type type);
1868
1958
 
1869
- GGML_API int gguf_get_version (struct gguf_context * ctx);
1870
- GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
1871
- GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
1872
- GGML_API void * gguf_get_data (struct gguf_context * ctx);
1873
-
1874
- GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
1875
- GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
1876
- GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
1877
-
1878
- GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
1879
- GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
1880
-
1881
- // results are undefined if the wrong type is used for the key
1882
- GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
1883
- GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
1884
- GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
1885
- GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
1886
- GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
1887
- GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
1888
- GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
1889
- GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
1890
- GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
1891
- GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
1892
- GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
1893
- GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
1894
- GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
1895
- GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
1896
- GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
1897
-
1898
- GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
1899
- GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
1900
- GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
1901
- GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
1959
+ GGML_API int gguf_get_version (const struct gguf_context * ctx);
1960
+ GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
1961
+ GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
1962
+ GGML_API void * gguf_get_data (const struct gguf_context * ctx);
1963
+
1964
+ GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
1965
+ GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
1966
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
1967
+
1968
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
1969
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
1970
+
1971
+ // will abort if the wrong type is used for the key
1972
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
1973
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
1974
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
1975
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
1976
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
1977
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
1978
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
1979
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
1980
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
1981
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
1982
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
1983
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
1984
+ GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
1985
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
1986
+ GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
1987
+
1988
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
1989
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
1990
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
1991
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
1902
1992
 
1903
1993
  // overrides existing values or adds a new one
1904
1994
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
@@ -1943,11 +2033,11 @@ extern "C" {
1943
2033
  //
1944
2034
 
1945
2035
  // write the entire context to a binary file
1946
- GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
2036
+ GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
1947
2037
 
1948
2038
  // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
1949
- GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
1950
- GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
2039
+ GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
2040
+ GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
1951
2041
 
1952
2042
  //
1953
2043
  // system info
@@ -1961,6 +2051,7 @@ extern "C" {
1961
2051
  GGML_API int ggml_cpu_has_fma (void);
1962
2052
  GGML_API int ggml_cpu_has_neon (void);
1963
2053
  GGML_API int ggml_cpu_has_arm_fma (void);
2054
+ GGML_API int ggml_cpu_has_metal (void);
1964
2055
  GGML_API int ggml_cpu_has_f16c (void);
1965
2056
  GGML_API int ggml_cpu_has_fp16_va (void);
1966
2057
  GGML_API int ggml_cpu_has_wasm_simd (void);