llama_cpp 0.10.0 → 0.10.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -215,9 +215,9 @@
215
215
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
216
216
 
217
217
  #define GGML_MAX_DIMS 4
218
- #define GGML_MAX_PARAMS 1024
218
+ #define GGML_MAX_PARAMS 2048
219
219
  #define GGML_MAX_CONTEXTS 64
220
- #define GGML_MAX_SRC 6
220
+ #define GGML_MAX_SRC 10
221
221
  #define GGML_MAX_NAME 64
222
222
  #define GGML_MAX_OP_PARAMS 64
223
223
  #define GGML_DEFAULT_N_THREADS 4
@@ -303,7 +303,7 @@ extern "C" {
303
303
 
304
304
  #if defined(__ARM_NEON) && defined(__CUDACC__)
305
305
  typedef half ggml_fp16_t;
306
- #elif defined(__ARM_NEON)
306
+ #elif defined(__ARM_NEON) && !defined(_MSC_VER)
307
307
  typedef __fp16 ggml_fp16_t;
308
308
  #else
309
309
  typedef uint16_t ggml_fp16_t;
@@ -343,6 +343,12 @@ extern "C" {
343
343
  GGML_TYPE_COUNT,
344
344
  };
345
345
 
346
+ // precision
347
+ enum ggml_prec {
348
+ GGML_PREC_DEFAULT,
349
+ GGML_PREC_F32,
350
+ };
351
+
346
352
  enum ggml_backend_type {
347
353
  GGML_BACKEND_CPU = 0,
348
354
  GGML_BACKEND_GPU = 10,
@@ -423,7 +429,9 @@ extern "C" {
423
429
  GGML_OP_POOL_1D,
424
430
  GGML_OP_POOL_2D,
425
431
  GGML_OP_UPSCALE, // nearest interpolate
432
+ GGML_OP_PAD,
426
433
  GGML_OP_ARGSORT,
434
+ GGML_OP_LEAKY_RELU,
427
435
 
428
436
  GGML_OP_FLASH_ATTN,
429
437
  GGML_OP_FLASH_FF,
@@ -463,7 +471,6 @@ extern "C" {
463
471
  GGML_UNARY_OP_GELU,
464
472
  GGML_UNARY_OP_GELU_QUICK,
465
473
  GGML_UNARY_OP_SILU,
466
- GGML_UNARY_OP_LEAKY,
467
474
 
468
475
  GGML_UNARY_OP_COUNT,
469
476
  };
@@ -477,7 +484,8 @@ extern "C" {
477
484
  enum ggml_log_level {
478
485
  GGML_LOG_LEVEL_ERROR = 2,
479
486
  GGML_LOG_LEVEL_WARN = 3,
480
- GGML_LOG_LEVEL_INFO = 4
487
+ GGML_LOG_LEVEL_INFO = 4,
488
+ GGML_LOG_LEVEL_DEBUG = 5
481
489
  };
482
490
 
483
491
  // ggml object
@@ -501,7 +509,6 @@ extern "C" {
501
509
 
502
510
  struct ggml_backend_buffer * buffer;
503
511
 
504
- int n_dims;
505
512
  int64_t ne[GGML_MAX_DIMS]; // number of elements
506
513
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
507
514
  // nb[0] = ggml_type_size(type)
@@ -533,7 +540,7 @@ extern "C" {
533
540
 
534
541
  void * extra; // extra things e.g. for ggml-cuda.cu
535
542
 
536
- char padding[12];
543
+ char padding[8];
537
544
  };
538
545
 
539
546
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -638,11 +645,14 @@ extern "C" {
638
645
  GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
639
646
  GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
640
647
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
641
- GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
642
648
 
643
- GGML_API int ggml_blck_size (enum ggml_type type);
644
- GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
645
- GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
649
+ GGML_API int ggml_blck_size(enum ggml_type type);
650
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
651
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
652
+
653
+ GGML_DEPRECATED(
654
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
655
+ "use ggml_row_size() instead");
646
656
 
647
657
  GGML_API const char * ggml_type_name(enum ggml_type type);
648
658
  GGML_API const char * ggml_op_name (enum ggml_op op);
@@ -661,6 +671,11 @@ extern "C" {
661
671
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
662
672
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
663
673
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
674
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
675
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
676
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
677
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
678
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
664
679
 
665
680
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
666
681
 
@@ -721,8 +736,8 @@ extern "C" {
721
736
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
722
737
 
723
738
  // Context tensor enumeration and lookup
724
- GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
725
- GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
739
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
740
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
726
741
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
727
742
 
728
743
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -793,6 +808,9 @@ extern "C" {
793
808
  struct ggml_tensor * a,
794
809
  struct ggml_tensor * b);
795
810
 
811
+ // dst = a
812
+ // view(dst, nb1, nb2, nb3, offset) += b
813
+ // return dst
796
814
  GGML_API struct ggml_tensor * ggml_acc(
797
815
  struct ggml_context * ctx,
798
816
  struct ggml_tensor * a,
@@ -957,15 +975,14 @@ extern "C" {
957
975
  struct ggml_context * ctx,
958
976
  struct ggml_tensor * a);
959
977
 
960
- GGML_API struct ggml_tensor * ggml_leaky(
978
+ GGML_API struct ggml_tensor * ggml_leaky_relu(
961
979
  struct ggml_context * ctx,
962
- struct ggml_tensor * a);
980
+ struct ggml_tensor * a, float negative_slope, bool inplace);
963
981
 
964
982
  GGML_API struct ggml_tensor * ggml_relu_inplace(
965
983
  struct ggml_context * ctx,
966
984
  struct ggml_tensor * a);
967
985
 
968
- // TODO: double-check this computation is correct
969
986
  GGML_API struct ggml_tensor * ggml_gelu(
970
987
  struct ggml_context * ctx,
971
988
  struct ggml_tensor * a);
@@ -1047,11 +1064,18 @@ extern "C" {
1047
1064
  struct ggml_tensor * a,
1048
1065
  struct ggml_tensor * b);
1049
1066
 
1067
+ // change the precision of a matrix multiplication
1068
+ // set to GGML_PREC_F32 for higher precision (useful for phi-2)
1069
+ GGML_API void ggml_mul_mat_set_prec(
1070
+ struct ggml_tensor * a,
1071
+ enum ggml_prec prec);
1072
+
1050
1073
  // indirect matrix multiplication
1051
1074
  // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
1052
1075
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
1053
1076
  struct ggml_context * ctx,
1054
- struct ggml_tensor * as[],
1077
+ struct ggml_tensor * const as[],
1078
+ int n_as,
1055
1079
  struct ggml_tensor * ids,
1056
1080
  int id,
1057
1081
  struct ggml_tensor * b);
@@ -1071,13 +1095,13 @@ extern "C" {
1071
1095
  GGML_API struct ggml_tensor * ggml_scale(
1072
1096
  struct ggml_context * ctx,
1073
1097
  struct ggml_tensor * a,
1074
- struct ggml_tensor * b);
1098
+ float s);
1075
1099
 
1076
1100
  // in-place, returns view(a)
1077
1101
  GGML_API struct ggml_tensor * ggml_scale_inplace(
1078
1102
  struct ggml_context * ctx,
1079
1103
  struct ggml_tensor * a,
1080
- struct ggml_tensor * b);
1104
+ float s);
1081
1105
 
1082
1106
  // b -> view(a,offset,nb1,nb2,3), return modified a
1083
1107
  GGML_API struct ggml_tensor * ggml_set(
@@ -1263,6 +1287,7 @@ extern "C" {
1263
1287
  struct ggml_context * ctx,
1264
1288
  struct ggml_tensor * a);
1265
1289
 
1290
+ // supports 3D: a->ne[2] == b->ne[1]
1266
1291
  GGML_API struct ggml_tensor * ggml_get_rows(
1267
1292
  struct ggml_context * ctx,
1268
1293
  struct ggml_tensor * a,
@@ -1549,6 +1574,15 @@ extern "C" {
1549
1574
  struct ggml_tensor * a,
1550
1575
  int scale_factor);
1551
1576
 
1577
+ // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
1578
+ GGML_API struct ggml_tensor * ggml_pad(
1579
+ struct ggml_context * ctx,
1580
+ struct ggml_tensor * a,
1581
+ int p0,
1582
+ int p1,
1583
+ int p2,
1584
+ int p3);
1585
+
1552
1586
  // sort rows
1553
1587
  enum ggml_sort_order {
1554
1588
  GGML_SORT_ASC,
@@ -2102,10 +2136,11 @@ extern "C" {
2102
2136
  GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
2103
2137
  GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
2104
2138
 
2105
- GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
2106
- GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
2107
- GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
2108
- GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
2139
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
2140
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
2141
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
2142
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
2143
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
2109
2144
 
2110
2145
  // overrides existing values or adds a new one
2111
2146
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);