llama_cpp 0.10.0 → 0.10.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -215,9 +215,9 @@
215
215
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
216
216
 
217
217
  #define GGML_MAX_DIMS 4
218
- #define GGML_MAX_PARAMS 1024
218
+ #define GGML_MAX_PARAMS 2048
219
219
  #define GGML_MAX_CONTEXTS 64
220
- #define GGML_MAX_SRC 6
220
+ #define GGML_MAX_SRC 10
221
221
  #define GGML_MAX_NAME 64
222
222
  #define GGML_MAX_OP_PARAMS 64
223
223
  #define GGML_DEFAULT_N_THREADS 4
@@ -303,7 +303,7 @@ extern "C" {
303
303
 
304
304
  #if defined(__ARM_NEON) && defined(__CUDACC__)
305
305
  typedef half ggml_fp16_t;
306
- #elif defined(__ARM_NEON)
306
+ #elif defined(__ARM_NEON) && !defined(_MSC_VER)
307
307
  typedef __fp16 ggml_fp16_t;
308
308
  #else
309
309
  typedef uint16_t ggml_fp16_t;
@@ -343,6 +343,12 @@ extern "C" {
343
343
  GGML_TYPE_COUNT,
344
344
  };
345
345
 
346
+ // precision
347
+ enum ggml_prec {
348
+ GGML_PREC_DEFAULT,
349
+ GGML_PREC_F32,
350
+ };
351
+
346
352
  enum ggml_backend_type {
347
353
  GGML_BACKEND_CPU = 0,
348
354
  GGML_BACKEND_GPU = 10,
@@ -423,7 +429,9 @@ extern "C" {
423
429
  GGML_OP_POOL_1D,
424
430
  GGML_OP_POOL_2D,
425
431
  GGML_OP_UPSCALE, // nearest interpolate
432
+ GGML_OP_PAD,
426
433
  GGML_OP_ARGSORT,
434
+ GGML_OP_LEAKY_RELU,
427
435
 
428
436
  GGML_OP_FLASH_ATTN,
429
437
  GGML_OP_FLASH_FF,
@@ -463,7 +471,6 @@ extern "C" {
463
471
  GGML_UNARY_OP_GELU,
464
472
  GGML_UNARY_OP_GELU_QUICK,
465
473
  GGML_UNARY_OP_SILU,
466
- GGML_UNARY_OP_LEAKY,
467
474
 
468
475
  GGML_UNARY_OP_COUNT,
469
476
  };
@@ -477,7 +484,8 @@ extern "C" {
477
484
  enum ggml_log_level {
478
485
  GGML_LOG_LEVEL_ERROR = 2,
479
486
  GGML_LOG_LEVEL_WARN = 3,
480
- GGML_LOG_LEVEL_INFO = 4
487
+ GGML_LOG_LEVEL_INFO = 4,
488
+ GGML_LOG_LEVEL_DEBUG = 5
481
489
  };
482
490
 
483
491
  // ggml object
@@ -501,7 +509,6 @@ extern "C" {
501
509
 
502
510
  struct ggml_backend_buffer * buffer;
503
511
 
504
- int n_dims;
505
512
  int64_t ne[GGML_MAX_DIMS]; // number of elements
506
513
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
507
514
  // nb[0] = ggml_type_size(type)
@@ -533,7 +540,7 @@ extern "C" {
533
540
 
534
541
  void * extra; // extra things e.g. for ggml-cuda.cu
535
542
 
536
- char padding[12];
543
+ char padding[8];
537
544
  };
538
545
 
539
546
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -638,11 +645,14 @@ extern "C" {
638
645
  GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
639
646
  GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
640
647
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
641
- GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
642
648
 
643
- GGML_API int ggml_blck_size (enum ggml_type type);
644
- GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
645
- GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
649
+ GGML_API int ggml_blck_size(enum ggml_type type);
650
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
651
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
652
+
653
+ GGML_DEPRECATED(
654
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as double
655
+ "use ggml_row_size() instead");
646
656
 
647
657
  GGML_API const char * ggml_type_name(enum ggml_type type);
648
658
  GGML_API const char * ggml_op_name (enum ggml_op op);
@@ -661,6 +671,11 @@ extern "C" {
661
671
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
662
672
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
663
673
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
674
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
675
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
676
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
677
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
678
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
664
679
 
665
680
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
666
681
 
@@ -721,8 +736,8 @@ extern "C" {
721
736
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
722
737
 
723
738
  // Context tensor enumeration and lookup
724
- GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
725
- GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
739
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
740
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
726
741
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
727
742
 
728
743
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -793,6 +808,9 @@ extern "C" {
793
808
  struct ggml_tensor * a,
794
809
  struct ggml_tensor * b);
795
810
 
811
+ // dst = a
812
+ // view(dst, nb1, nb2, nb3, offset) += b
813
+ // return dst
796
814
  GGML_API struct ggml_tensor * ggml_acc(
797
815
  struct ggml_context * ctx,
798
816
  struct ggml_tensor * a,
@@ -957,15 +975,14 @@ extern "C" {
957
975
  struct ggml_context * ctx,
958
976
  struct ggml_tensor * a);
959
977
 
960
- GGML_API struct ggml_tensor * ggml_leaky(
978
+ GGML_API struct ggml_tensor * ggml_leaky_relu(
961
979
  struct ggml_context * ctx,
962
- struct ggml_tensor * a);
980
+ struct ggml_tensor * a, float negative_slope, bool inplace);
963
981
 
964
982
  GGML_API struct ggml_tensor * ggml_relu_inplace(
965
983
  struct ggml_context * ctx,
966
984
  struct ggml_tensor * a);
967
985
 
968
- // TODO: double-check this computation is correct
969
986
  GGML_API struct ggml_tensor * ggml_gelu(
970
987
  struct ggml_context * ctx,
971
988
  struct ggml_tensor * a);
@@ -1047,11 +1064,18 @@ extern "C" {
1047
1064
  struct ggml_tensor * a,
1048
1065
  struct ggml_tensor * b);
1049
1066
 
1067
+ // change the precision of a matrix multiplication
1068
+ // set to GGML_PREC_F32 for higher precision (useful for phi-2)
1069
+ GGML_API void ggml_mul_mat_set_prec(
1070
+ struct ggml_tensor * a,
1071
+ enum ggml_prec prec);
1072
+
1050
1073
  // indirect matrix multiplication
1051
1074
  // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
1052
1075
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
1053
1076
  struct ggml_context * ctx,
1054
- struct ggml_tensor * as[],
1077
+ struct ggml_tensor * const as[],
1078
+ int n_as,
1055
1079
  struct ggml_tensor * ids,
1056
1080
  int id,
1057
1081
  struct ggml_tensor * b);
@@ -1071,13 +1095,13 @@ extern "C" {
1071
1095
  GGML_API struct ggml_tensor * ggml_scale(
1072
1096
  struct ggml_context * ctx,
1073
1097
  struct ggml_tensor * a,
1074
- struct ggml_tensor * b);
1098
+ float s);
1075
1099
 
1076
1100
  // in-place, returns view(a)
1077
1101
  GGML_API struct ggml_tensor * ggml_scale_inplace(
1078
1102
  struct ggml_context * ctx,
1079
1103
  struct ggml_tensor * a,
1080
- struct ggml_tensor * b);
1104
+ float s);
1081
1105
 
1082
1106
  // b -> view(a,offset,nb1,nb2,nb3), return modified a
1083
1107
  GGML_API struct ggml_tensor * ggml_set(
@@ -1263,6 +1287,7 @@ extern "C" {
1263
1287
  struct ggml_context * ctx,
1264
1288
  struct ggml_tensor * a);
1265
1289
 
1290
+ // supports 3D: a->ne[2] == b->ne[1]
1266
1291
  GGML_API struct ggml_tensor * ggml_get_rows(
1267
1292
  struct ggml_context * ctx,
1268
1293
  struct ggml_tensor * a,
@@ -1549,6 +1574,15 @@ extern "C" {
1549
1574
  struct ggml_tensor * a,
1550
1575
  int scale_factor);
1551
1576
 
1577
+ // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
1578
+ GGML_API struct ggml_tensor * ggml_pad(
1579
+ struct ggml_context * ctx,
1580
+ struct ggml_tensor * a,
1581
+ int p0,
1582
+ int p1,
1583
+ int p2,
1584
+ int p3);
1585
+
1552
1586
  // sort rows
1553
1587
  enum ggml_sort_order {
1554
1588
  GGML_SORT_ASC,
@@ -2102,10 +2136,11 @@ extern "C" {
2102
2136
  GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
2103
2137
  GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
2104
2138
 
2105
- GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
2106
- GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
2107
- GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
2108
- GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
2139
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
2140
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
2141
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
2142
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
2143
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
2109
2144
 
2110
2145
  // overrides existing values or adds a new one
2111
2146
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);