llama_cpp 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -296,12 +296,14 @@ extern "C" {
296
296
  GGML_OP_SUM_ROWS,
297
297
  GGML_OP_MEAN,
298
298
  GGML_OP_REPEAT,
299
+ GGML_OP_REPEAT_BACK,
299
300
  GGML_OP_ABS,
300
301
  GGML_OP_SGN,
301
302
  GGML_OP_NEG,
302
303
  GGML_OP_STEP,
303
304
  GGML_OP_RELU,
304
305
  GGML_OP_GELU,
306
+ GGML_OP_GELU_QUICK,
305
307
  GGML_OP_SILU,
306
308
  GGML_OP_SILU_BACK,
307
309
  GGML_OP_NORM, // normalize
@@ -309,6 +311,7 @@ extern "C" {
309
311
  GGML_OP_RMS_NORM_BACK,
310
312
 
311
313
  GGML_OP_MUL_MAT,
314
+ GGML_OP_OUT_PROD,
312
315
 
313
316
  GGML_OP_SCALE,
314
317
  GGML_OP_SET,
@@ -324,19 +327,27 @@ extern "C" {
324
327
  GGML_OP_DIAG_MASK_INF,
325
328
  GGML_OP_DIAG_MASK_ZERO,
326
329
  GGML_OP_SOFT_MAX,
330
+ GGML_OP_SOFT_MAX_BACK,
327
331
  GGML_OP_ROPE,
328
332
  GGML_OP_ROPE_BACK,
329
333
  GGML_OP_ALIBI,
330
334
  GGML_OP_CLAMP,
331
- GGML_OP_CONV_1D_1S,
332
- GGML_OP_CONV_1D_2S,
335
+ GGML_OP_CONV_1D_S1_PH,
336
+ GGML_OP_CONV_1D_S2_PH,
337
+ GGML_OP_CONV_2D_SK_P0,
333
338
 
334
339
  GGML_OP_FLASH_ATTN,
335
340
  GGML_OP_FLASH_FF,
341
+ GGML_OP_FLASH_ATTN_BACK,
342
+ GGML_OP_WIN_PART,
343
+ GGML_OP_WIN_UNPART,
336
344
 
337
345
  GGML_OP_MAP_UNARY,
338
346
  GGML_OP_MAP_BINARY,
339
347
 
348
+ GGML_OP_CROSS_ENTROPY_LOSS,
349
+ GGML_OP_CROSS_ENTROPY_LOSS_BACK,
350
+
340
351
  GGML_OP_COUNT,
341
352
  };
342
353
 
@@ -478,6 +489,7 @@ extern "C" {
478
489
 
479
490
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
480
491
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
492
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
481
493
 
482
494
  // use this to compute the memory overhead of a tensor
483
495
  GGML_API size_t ggml_tensor_overhead(void);
@@ -492,8 +504,9 @@ extern "C" {
492
504
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
493
505
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
494
506
 
495
- GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
496
- GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
507
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
508
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
509
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
497
510
 
498
511
  GGML_API struct ggml_tensor * ggml_new_tensor(
499
512
  struct ggml_context * ctx,
@@ -548,8 +561,8 @@ extern "C" {
548
561
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
549
562
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
550
563
 
551
- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
552
- GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
564
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
565
+ GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
553
566
 
554
567
  //
555
568
  // operations on tensors with backpropagation
@@ -574,6 +587,11 @@ extern "C" {
574
587
  struct ggml_tensor * a,
575
588
  struct ggml_tensor * b);
576
589
 
590
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
591
+ struct ggml_context * ctx,
592
+ struct ggml_tensor * a,
593
+ struct ggml_tensor * b);
594
+
577
595
  GGML_API struct ggml_tensor * ggml_acc(
578
596
  struct ggml_context * ctx,
579
597
  struct ggml_tensor * a,
@@ -597,24 +615,47 @@ extern "C" {
597
615
  struct ggml_tensor * a,
598
616
  struct ggml_tensor * b);
599
617
 
618
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
619
+ struct ggml_context * ctx,
620
+ struct ggml_tensor * a,
621
+ struct ggml_tensor * b);
622
+
600
623
  GGML_API struct ggml_tensor * ggml_mul(
601
624
  struct ggml_context * ctx,
602
625
  struct ggml_tensor * a,
603
626
  struct ggml_tensor * b);
604
627
 
628
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
629
+ struct ggml_context * ctx,
630
+ struct ggml_tensor * a,
631
+ struct ggml_tensor * b);
632
+
605
633
  GGML_API struct ggml_tensor * ggml_div(
606
634
  struct ggml_context * ctx,
607
635
  struct ggml_tensor * a,
608
636
  struct ggml_tensor * b);
609
637
 
638
+ GGML_API struct ggml_tensor * ggml_div_inplace(
639
+ struct ggml_context * ctx,
640
+ struct ggml_tensor * a,
641
+ struct ggml_tensor * b);
642
+
610
643
  GGML_API struct ggml_tensor * ggml_sqr(
611
644
  struct ggml_context * ctx,
612
645
  struct ggml_tensor * a);
613
646
 
647
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
648
+ struct ggml_context * ctx,
649
+ struct ggml_tensor * a);
650
+
614
651
  GGML_API struct ggml_tensor * ggml_sqrt(
615
652
  struct ggml_context * ctx,
616
653
  struct ggml_tensor * a);
617
654
 
655
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
656
+ struct ggml_context * ctx,
657
+ struct ggml_tensor * a);
658
+
618
659
  GGML_API struct ggml_tensor * ggml_log(
619
660
  struct ggml_context * ctx,
620
661
  struct ggml_tensor * a);
@@ -645,35 +686,76 @@ extern "C" {
645
686
  struct ggml_tensor * a,
646
687
  struct ggml_tensor * b);
647
688
 
689
+ GGML_API struct ggml_tensor * ggml_repeat_back(
690
+ struct ggml_context * ctx,
691
+ struct ggml_tensor * a,
692
+ struct ggml_tensor * b);
693
+
648
694
  GGML_API struct ggml_tensor * ggml_abs(
649
695
  struct ggml_context * ctx,
650
696
  struct ggml_tensor * a);
651
697
 
698
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
699
+ struct ggml_context * ctx,
700
+ struct ggml_tensor * a);
701
+
652
702
  GGML_API struct ggml_tensor * ggml_sgn(
653
703
  struct ggml_context * ctx,
654
704
  struct ggml_tensor * a);
655
705
 
706
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
707
+ struct ggml_context * ctx,
708
+ struct ggml_tensor * a);
709
+
656
710
  GGML_API struct ggml_tensor * ggml_neg(
657
711
  struct ggml_context * ctx,
658
712
  struct ggml_tensor * a);
659
713
 
714
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
715
+ struct ggml_context * ctx,
716
+ struct ggml_tensor * a);
717
+
660
718
  GGML_API struct ggml_tensor * ggml_step(
661
719
  struct ggml_context * ctx,
662
720
  struct ggml_tensor * a);
663
721
 
722
+ GGML_API struct ggml_tensor * ggml_step_inplace(
723
+ struct ggml_context * ctx,
724
+ struct ggml_tensor * a);
725
+
664
726
  GGML_API struct ggml_tensor * ggml_relu(
665
727
  struct ggml_context * ctx,
666
728
  struct ggml_tensor * a);
667
729
 
730
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
731
+ struct ggml_context * ctx,
732
+ struct ggml_tensor * a);
733
+
668
734
  // TODO: double-check this computation is correct
669
735
  GGML_API struct ggml_tensor * ggml_gelu(
670
736
  struct ggml_context * ctx,
671
737
  struct ggml_tensor * a);
672
738
 
739
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
740
+ struct ggml_context * ctx,
741
+ struct ggml_tensor * a);
742
+
743
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
744
+ struct ggml_context * ctx,
745
+ struct ggml_tensor * a);
746
+
747
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
748
+ struct ggml_context * ctx,
749
+ struct ggml_tensor * a);
750
+
673
751
  GGML_API struct ggml_tensor * ggml_silu(
674
752
  struct ggml_context * ctx,
675
753
  struct ggml_tensor * a);
676
754
 
755
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
756
+ struct ggml_context * ctx,
757
+ struct ggml_tensor * a);
758
+
677
759
  // a - x
678
760
  // b - dy
679
761
  GGML_API struct ggml_tensor * ggml_silu_back(
@@ -687,10 +769,18 @@ extern "C" {
687
769
  struct ggml_context * ctx,
688
770
  struct ggml_tensor * a);
689
771
 
772
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
773
+ struct ggml_context * ctx,
774
+ struct ggml_tensor * a);
775
+
690
776
  GGML_API struct ggml_tensor * ggml_rms_norm(
691
777
  struct ggml_context * ctx,
692
778
  struct ggml_tensor * a);
693
779
 
780
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
781
+ struct ggml_context * ctx,
782
+ struct ggml_tensor * a);
783
+
694
784
  // a - x
695
785
  // b - dy
696
786
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -698,14 +788,22 @@ extern "C" {
698
788
  struct ggml_tensor * a,
699
789
  struct ggml_tensor * b);
700
790
 
701
- // A: m rows, n columns
702
- // B: p rows, n columns (i.e. we transpose it internally)
791
+ // A: n columns, m rows
792
+ // B: n columns, p rows (i.e. we transpose it internally)
703
793
  // result is m columns, p rows
704
794
  GGML_API struct ggml_tensor * ggml_mul_mat(
705
795
  struct ggml_context * ctx,
706
796
  struct ggml_tensor * a,
707
797
  struct ggml_tensor * b);
708
798
 
799
+ // A: m columns, n rows,
800
+ // B: p columns, n rows,
801
+ // result is m columns, p rows
802
+ GGML_API struct ggml_tensor * ggml_out_prod(
803
+ struct ggml_context * ctx,
804
+ struct ggml_tensor * a,
805
+ struct ggml_tensor * b);
806
+
709
807
  //
710
808
  // operations on tensors without backpropagation
711
809
  //
@@ -916,6 +1014,17 @@ extern "C" {
916
1014
  struct ggml_context * ctx,
917
1015
  struct ggml_tensor * a);
918
1016
 
1017
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
1018
+ struct ggml_context * ctx,
1019
+ struct ggml_tensor * a,
1020
+ struct ggml_tensor * b);
1021
+
1022
+ // in-place, returns view(a)
1023
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
1024
+ struct ggml_context * ctx,
1025
+ struct ggml_tensor * a,
1026
+ struct ggml_tensor * b);
1027
+
919
1028
  // rotary position embedding
920
1029
  // if (mode & 1) != 0, skip n_past elements
921
1030
  // if (mode & 2) != 0, GPT-NeoX style
@@ -961,16 +1070,55 @@ extern "C" {
961
1070
  float min,
962
1071
  float max);
963
1072
 
964
- // padding = 1
1073
+ // TODO: implement general-purpose convolutions
1074
+ // GGML_API struct ggml_tensor * ggml_conv_1d(
1075
+ // struct ggml_context * ctx,
1076
+ // struct ggml_tensor * a,
1077
+ // struct ggml_tensor * b,
1078
+ // int s0,
1079
+ // int p0,
1080
+ // int d0);
1081
+ //
1082
+ // GGML_API struct ggml_tensor * ggml_conv_2d(
1083
+ // struct ggml_context * ctx,
1084
+ // struct ggml_tensor * a,
1085
+ // struct ggml_tensor * b,
1086
+ // int s0,
1087
+ // int s1,
1088
+ // int p0,
1089
+ // int p1,
1090
+ // int d0,
1091
+ // int d1);
1092
+
1093
+ // padding = half
965
1094
  // TODO: we don't support extra parameters for now
966
1095
  // that's why we are hard-coding the stride, padding, and dilation
967
1096
  // not great ..
968
- GGML_API struct ggml_tensor * ggml_conv_1d_1s(
1097
+ // example:
1098
+ // a: 3 80 768 1
1099
+ // b: 3000 80 1 1
1100
+ // res: 3000 768 1 1
1101
+ // used in whisper
1102
+ GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
1103
+ struct ggml_context * ctx,
1104
+ struct ggml_tensor * a,
1105
+ struct ggml_tensor * b);
1106
+
1107
+ // used in whisper
1108
+ GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
969
1109
  struct ggml_context * ctx,
970
1110
  struct ggml_tensor * a,
971
1111
  struct ggml_tensor * b);
972
1112
 
973
- GGML_API struct ggml_tensor * ggml_conv_1d_2s(
1113
+ // kernel size is a->ne[0] x a->ne[1]
1114
+ // stride is equal to kernel size
1115
+ // padding is zero
1116
+ // example:
1117
+ // a: 16 16 3 768
1118
+ // b: 1024 1024 3 1
1119
+ // res: 64 64 768 1
1120
+ // used in sam
1121
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
974
1122
  struct ggml_context * ctx,
975
1123
  struct ggml_tensor * a,
976
1124
  struct ggml_tensor * b);
@@ -982,6 +1130,14 @@ extern "C" {
982
1130
  struct ggml_tensor * v,
983
1131
  bool masked);
984
1132
 
1133
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
1134
+ struct ggml_context * ctx,
1135
+ struct ggml_tensor * q,
1136
+ struct ggml_tensor * k,
1137
+ struct ggml_tensor * v,
1138
+ struct ggml_tensor * d,
1139
+ bool masked);
1140
+
985
1141
  GGML_API struct ggml_tensor * ggml_flash_ff(
986
1142
  struct ggml_context * ctx,
987
1143
  struct ggml_tensor * a,
@@ -990,6 +1146,26 @@ extern "C" {
990
1146
  struct ggml_tensor * c0,
991
1147
  struct ggml_tensor * c1);
992
1148
 
1149
+ // partition into non-overlapping windows with padding if needed
1150
+ // example:
1151
+ // a: 768 64 64 1
1152
+ // w: 14
1153
+ // res: 768 14 14 25
1154
+ // used in sam
1155
+ GGML_API struct ggml_tensor * ggml_win_part(
1156
+ struct ggml_context * ctx,
1157
+ struct ggml_tensor * a,
1158
+ int w);
1159
+
1160
+ // reverse of ggml_win_part
1161
+ // used in sam
1162
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1163
+ struct ggml_context * ctx,
1164
+ struct ggml_tensor * a,
1165
+ int w0,
1166
+ int h0,
1167
+ int w);
1168
+
993
1169
  // Mapping operations
994
1170
  typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
995
1171
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
@@ -1005,6 +1181,19 @@ extern "C" {
1005
1181
  struct ggml_tensor * b,
1006
1182
  ggml_binary_op_f32_t fun);
1007
1183
 
1184
+ // loss function
1185
+
1186
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1187
+ struct ggml_context * ctx,
1188
+ struct ggml_tensor * a,
1189
+ struct ggml_tensor * b);
1190
+
1191
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1192
+ struct ggml_context * ctx,
1193
+ struct ggml_tensor * a,
1194
+ struct ggml_tensor * b,
1195
+ struct ggml_tensor * c);
1196
+
1008
1197
  //
1009
1198
  // automatic differentiation
1010
1199
  //
@@ -1099,6 +1288,8 @@ extern "C" {
1099
1288
  struct {
1100
1289
  int n_iter;
1101
1290
 
1291
+ float sched; // schedule multiplier (fixed, decay or warmup)
1292
+ float decay; // weight decay for AdamW, use 0.0f to disable
1102
1293
  float alpha; // learning rate
1103
1294
  float beta1;
1104
1295
  float beta2;
@@ -1123,6 +1314,49 @@ extern "C" {
1123
1314
  } lbfgs;
1124
1315
  };
1125
1316
 
1317
+ struct ggml_opt_context {
1318
+ struct ggml_context * ctx;
1319
+ struct ggml_opt_params params;
1320
+
1321
+ int iter;
1322
+ int64_t nx; // number of parameter elements
1323
+
1324
+ bool just_initialized;
1325
+
1326
+ struct {
1327
+ struct ggml_tensor * x; // view of the parameters
1328
+ struct ggml_tensor * g1; // gradient
1329
+ struct ggml_tensor * g2; // gradient squared
1330
+ struct ggml_tensor * m; // first moment
1331
+ struct ggml_tensor * v; // second moment
1332
+ struct ggml_tensor * mh; // first moment hat
1333
+ struct ggml_tensor * vh; // second moment hat
1334
+ struct ggml_tensor * pf; // past function values
1335
+ float fx_best;
1336
+ float fx_prev;
1337
+ int n_no_improvement;
1338
+ } adam;
1339
+
1340
+ struct {
1341
+ struct ggml_tensor * x; // current parameters
1342
+ struct ggml_tensor * xp; // previous parameters
1343
+ struct ggml_tensor * g; // current gradient
1344
+ struct ggml_tensor * gp; // previous gradient
1345
+ struct ggml_tensor * d; // search direction
1346
+ struct ggml_tensor * pf; // past function values
1347
+ struct ggml_tensor * lmal; // the L-BFGS memory alpha
1348
+ struct ggml_tensor * lmys; // the L-BFGS memory ys
1349
+ struct ggml_tensor * lms; // the L-BFGS memory s
1350
+ struct ggml_tensor * lmy; // the L-BFGS memory y
1351
+ float fx_best;
1352
+ float step;
1353
+ int j;
1354
+ int k;
1355
+ int end;
1356
+ int n_no_improvement;
1357
+ } lbfgs;
1358
+ };
1359
+
1126
1360
  GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
1127
1361
 
1128
1362
  // optimize the function defined by the tensor f
@@ -1131,6 +1365,27 @@ extern "C" {
1131
1365
  struct ggml_opt_params params,
1132
1366
  struct ggml_tensor * f);
1133
1367
 
1368
+ // initialize optimizer context
1369
+ GGML_API void ggml_opt_init(
1370
+ struct ggml_context * ctx,
1371
+ struct ggml_opt_context * opt,
1372
+ struct ggml_opt_params params,
1373
+ int64_t nx);
1374
+
1375
+ // continue optimizing the function defined by the tensor f
1376
+ GGML_API enum ggml_opt_result ggml_opt_resume(
1377
+ struct ggml_context * ctx,
1378
+ struct ggml_opt_context * opt,
1379
+ struct ggml_tensor * f);
1380
+
1381
+ // continue optimizing the function defined by the tensor f
1382
+ GGML_API enum ggml_opt_result ggml_opt_resume_g(
1383
+ struct ggml_context * ctx,
1384
+ struct ggml_opt_context * opt,
1385
+ struct ggml_tensor * f,
1386
+ struct ggml_cgraph * gf,
1387
+ struct ggml_cgraph * gb);
1388
+
1134
1389
  //
1135
1390
  // quantization
1136
1391
  //