llama_cpp 0.9.5 → 0.10.1

@@ -215,9 +215,9 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS 4
-#define GGML_MAX_PARAMS 1024
+#define GGML_MAX_PARAMS 2048
 #define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_SRC 6
+#define GGML_MAX_SRC 10
 #define GGML_MAX_NAME 64
 #define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4
@@ -283,6 +283,20 @@
     const type prefix##3 = (pointer)->array[3]; \
     GGML_UNUSED(prefix##3);
 
+#define GGML_TENSOR_UNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
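
The two new convenience macros bundle the GGML_TENSOR_LOCALS declarations that op kernels typically need for their source and destination tensors. A minimal sketch of how a custom kernel might use the unary variant (the function name and loop body are hypothetical; the macro assumes tensors named src0 and dst are in scope, and the sketch assumes contiguous F32 data):

    #include "ggml.h"

    // Hypothetical element-wise kernel. GGML_TENSOR_UNARY_OP_LOCALS declares
    // ne00..ne03 / nb00..nb03 from src0 and ne0..ne3 / nb0..nb3 from dst.
    static void my_square_f32(const struct ggml_tensor * src0, struct ggml_tensor * dst) {
        GGML_TENSOR_UNARY_OP_LOCALS

        const float * x = (const float *) src0->data;
        float       * y = (float *) dst->data;
        for (int64_t i = 0; i < ne00*ne01*ne02*ne03; ++i) {
            y[i] = x[i]*x[i]; // square every element
        }
    }
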
@@ -381,6 +395,7 @@ extern "C" {
         GGML_OP_GROUP_NORM,
 
         GGML_OP_MUL_MAT,
+        GGML_OP_MUL_MAT_ID,
         GGML_OP_OUT_PROD,
 
         GGML_OP_SCALE,
@@ -407,8 +422,10 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
-
         GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_PAD,
+        GGML_OP_ARGSORT,
+        GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -448,7 +465,8 @@ extern "C" {
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
-        GGML_UNARY_OP_LEAKY
+
+        GGML_UNARY_OP_COUNT,
     };
 
     enum ggml_object_type {
@@ -484,7 +502,6 @@ extern "C" {
 
         struct ggml_backend_buffer * buffer;
 
-        int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                    // nb[0] = ggml_type_size(type)
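
With the n_dims field gone from struct ggml_tensor, code that read it directly needs the new accessors declared further down in this diff (ggml_n_dims() and the ggml_is_scalar/vector/matrix/3d predicates, see the -641 hunk). A hedged migration sketch with a hypothetical helper:

    #include "ggml.h"

    // Hypothetical helper: branch on tensor rank without the removed field.
    static void describe(const struct ggml_tensor * t) {
        // old: if (t->n_dims == 2) { ... }   // field no longer exists
        if (ggml_is_matrix(t)) {
            // 2-D case
        }
        const int n_dims = ggml_n_dims(t); // returns 1 for scalars
        (void) n_dims;
    }
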
@@ -516,7 +533,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[12];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -621,16 +638,22 @@ extern "C" {
     GGML_API int64_t ggml_nrows      (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes     (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
-    GGML_API int     ggml_blck_size (enum ggml_type type);
-    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op op);
     GGML_API const char * ggml_op_symbol(enum ggml_op op);
 
+    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
     GGML_API bool ggml_is_quantized(enum ggml_type type);
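
ggml_type_sizef() is deprecated in favour of ggml_row_size(), which returns an exact byte count for a whole row instead of a fractional per-element size. A minimal migration sketch, assuming a placeholder row length:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        const int64_t ne = 4096; // placeholder: number of elements in one row

        // old (now deprecated): per-element size as a float, multiplied out by hand
        //   size_t row_bytes = (size_t)(ggml_type_sizef(GGML_TYPE_Q4_0) * ne);

        // new: exact size in bytes of a row of `ne` elements of the given type
        const size_t row_bytes = ggml_row_size(GGML_TYPE_Q4_0, ne);
        printf("Q4_0 row of %lld elements: %zu bytes\n", (long long) ne, row_bytes);
        return 0;
    }
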
@@ -641,6 +664,11 @@ extern "C" {
     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -773,6 +801,9 @@ extern "C" {
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+    // dst = a
+    // view(dst, nb1, nb2, nb3, offset) += b
+    // return dst
     GGML_API struct ggml_tensor * ggml_acc(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
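
The new comment documents ggml_acc(): the result is a copy of a with b accumulated into the view selected by the strides and byte offset. A hedged sketch, assuming the trailing parameters of ggml_acc (not shown in this hunk) are the view strides nb1..nb3 and a byte offset, as in the upstream header; all names are placeholders:

    #include "ggml.h"

    // Hypothetical: add the single-row tensor b into row `row` of the 2-D tensor a.
    // The view reuses a's own strides, so only the offset selects the target row.
    static struct ggml_tensor * acc_row(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b,
            int64_t row) {
        return ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], row * a->nb[1]);
    }
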
@@ -937,15 +968,14 @@ extern "C" {
         struct ggml_context * ctx,
         struct ggml_tensor * a);
 
-    GGML_API struct ggml_tensor * ggml_leaky(
+    GGML_API struct ggml_tensor * ggml_leaky_relu(
         struct ggml_context * ctx,
-        struct ggml_tensor * a);
+        struct ggml_tensor * a, float negative_slope, bool inplace);
 
     GGML_API struct ggml_tensor * ggml_relu_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
 
-    // TODO: double-check this computation is correct
     GGML_API struct ggml_tensor * ggml_gelu(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
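
ggml_leaky() is replaced by ggml_leaky_relu(), which takes an explicit negative slope and an inplace flag. A minimal migration sketch (ctx and x are placeholders; choose the slope your model actually uses):

    #include "ggml.h"

    // Hypothetical graph-building helper; ctx and x come from the caller.
    static struct ggml_tensor * build_leaky(struct ggml_context * ctx, struct ggml_tensor * x) {
        // old: return ggml_leaky(ctx, x);
        // new: slope and in-place behaviour are explicit
        return ggml_leaky_relu(ctx, x, 0.1f, /*inplace =*/ false);
    }
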
@@ -1027,6 +1057,16 @@ extern "C" {
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+    // indirect matrix multiplication
+    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
+    GGML_API struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor * const as[],
+        int n_as,
+        struct ggml_tensor * ids,
+        int id,
+        struct ggml_tensor * b);
+
     // A: m columns, n rows,
     // B: p columns, n rows,
     // result is m columns, p rows
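
ggml_mul_mat_id() backs the new GGML_OP_MUL_MAT_ID op: it behaves like ggml_mul_mat(), but the weight matrix is picked out of an array by an index read from the ids tensor, which is what mixture-of-experts routing needs. A hedged sketch of one routed matmul (all names are placeholders; experts holds n_expert same-shape matrices and ids is a GGML_TYPE_I32 tensor of expert indices):

    #include "ggml.h"

    // Hypothetical routed matmul: roughly ggml_mul_mat(experts[ids[id]], cur),
    // except the index is read from the ids tensor when the graph is executed.
    static struct ggml_tensor * route_one(
            struct ggml_context * ctx,
            struct ggml_tensor * const experts[],
            int n_expert,
            struct ggml_tensor * ids,   // GGML_TYPE_I32 indices
            struct ggml_tensor * cur) {
        return ggml_mul_mat_id(ctx, experts, n_expert, ids, /*id =*/ 0, cur);
    }
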
@@ -1234,6 +1274,7 @@ extern "C" {
         struct ggml_context * ctx,
         struct ggml_tensor * a);
 
+    // supports 3D: a->ne[2] == b->ne[1]
     GGML_API struct ggml_tensor * ggml_get_rows(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -1520,6 +1561,32 @@ extern "C" {
         struct ggml_tensor * a,
         int scale_factor);
 
+    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+    GGML_API struct ggml_tensor * ggml_pad(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int p0,
+        int p1,
+        int p2,
+        int p3);
+
+    // sort rows
+    enum ggml_sort_order {
+        GGML_SORT_ASC,
+        GGML_SORT_DESC,
+    };
+
+    GGML_API struct ggml_tensor * ggml_argsort(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_sort_order order);
+
+    // top k elements per row
+    GGML_API struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int k);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
         struct ggml_context * ctx,
         struct ggml_tensor * q,
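
ggml_pad(), ggml_argsort() and ggml_top_k() are the API entry points for the new GGML_OP_PAD and GGML_OP_ARGSORT ops. A hedged graph-building sketch on a tensor of per-row scores (ctx and logits are placeholders):

    #include "ggml.h"

    // Hypothetical helper exercising the new ops.
    static struct ggml_tensor * build_topk(struct ggml_context * ctx, struct ggml_tensor * logits) {
        // zero-pad dimension 0 by 3 elements, leave the other dimensions as they are
        struct ggml_tensor * padded = ggml_pad(ctx, logits, 3, 0, 0, 0);
        (void) padded;

        // per-row ordering, largest first
        struct ggml_tensor * order = ggml_argsort(ctx, logits, GGML_SORT_DESC);
        (void) order;

        // or take the top 10 entries of each row directly
        return ggml_top_k(ctx, logits, 10);
    }
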
@@ -1581,7 +1648,6 @@ extern "C" {
         int kh);
 
     // used in sam
-
     GGML_API struct ggml_tensor * ggml_add_rel_pos(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -1756,7 +1822,7 @@ extern "C" {
     GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph * ggml_graph_view       (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API struct ggml_cgraph   ggml_graph_view       (struct ggml_cgraph * cgraph, int i0, int i1);
     GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
     GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // zero grads
     GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
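
ggml_graph_view() changes signature: it no longer allocates from a ggml_context and now returns the view by value. A hedged migration sketch (gf, i0 and i1 are placeholders):

    #include "ggml.h"

    // Hypothetical: take a view over a sub-range of nodes of an existing graph.
    static struct ggml_cgraph view_slice(struct ggml_cgraph * gf, int i0, int i1) {
        // old: struct ggml_cgraph * gv = ggml_graph_view(ctx, gf, i0, i1);
        // new: no context argument; the view is returned by value
        return ggml_graph_view(gf, i0, i1);
    }
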