llama_cpp 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -296,12 +296,14 @@ extern "C" {
296
296
  GGML_OP_SUM_ROWS,
297
297
  GGML_OP_MEAN,
298
298
  GGML_OP_REPEAT,
299
+ GGML_OP_REPEAT_BACK,
299
300
  GGML_OP_ABS,
300
301
  GGML_OP_SGN,
301
302
  GGML_OP_NEG,
302
303
  GGML_OP_STEP,
303
304
  GGML_OP_RELU,
304
305
  GGML_OP_GELU,
306
+ GGML_OP_GELU_QUICK,
305
307
  GGML_OP_SILU,
306
308
  GGML_OP_SILU_BACK,
307
309
  GGML_OP_NORM, // normalize
@@ -309,6 +311,7 @@ extern "C" {
309
311
  GGML_OP_RMS_NORM_BACK,
310
312
 
311
313
  GGML_OP_MUL_MAT,
314
+ GGML_OP_OUT_PROD,
312
315
 
313
316
  GGML_OP_SCALE,
314
317
  GGML_OP_SET,
@@ -324,19 +327,27 @@ extern "C" {
324
327
  GGML_OP_DIAG_MASK_INF,
325
328
  GGML_OP_DIAG_MASK_ZERO,
326
329
  GGML_OP_SOFT_MAX,
330
+ GGML_OP_SOFT_MAX_BACK,
327
331
  GGML_OP_ROPE,
328
332
  GGML_OP_ROPE_BACK,
329
333
  GGML_OP_ALIBI,
330
334
  GGML_OP_CLAMP,
331
- GGML_OP_CONV_1D_1S,
332
- GGML_OP_CONV_1D_2S,
335
+ GGML_OP_CONV_1D_S1_PH,
336
+ GGML_OP_CONV_1D_S2_PH,
337
+ GGML_OP_CONV_2D_SK_P0,
333
338
 
334
339
  GGML_OP_FLASH_ATTN,
335
340
  GGML_OP_FLASH_FF,
341
+ GGML_OP_FLASH_ATTN_BACK,
342
+ GGML_OP_WIN_PART,
343
+ GGML_OP_WIN_UNPART,
336
344
 
337
345
  GGML_OP_MAP_UNARY,
338
346
  GGML_OP_MAP_BINARY,
339
347
 
348
+ GGML_OP_CROSS_ENTROPY_LOSS,
349
+ GGML_OP_CROSS_ENTROPY_LOSS_BACK,
350
+
340
351
  GGML_OP_COUNT,
341
352
  };
342
353
 
@@ -478,6 +489,7 @@ extern "C" {
478
489
 
479
490
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
480
491
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
492
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
481
493
 
482
494
  // use this to compute the memory overhead of a tensor
483
495
  GGML_API size_t ggml_tensor_overhead(void);
@@ -492,8 +504,9 @@ extern "C" {
492
504
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
493
505
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
494
506
 
495
- GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
496
- GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
507
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
508
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
509
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
497
510
 
498
511
  GGML_API struct ggml_tensor * ggml_new_tensor(
499
512
  struct ggml_context * ctx,
@@ -548,8 +561,8 @@ extern "C" {
548
561
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
549
562
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
550
563
 
551
- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
552
- GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
564
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
565
+ GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
553
566
 
554
567
  //
555
568
  // operations on tensors with backpropagation
@@ -574,6 +587,11 @@ extern "C" {
574
587
  struct ggml_tensor * a,
575
588
  struct ggml_tensor * b);
576
589
 
590
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
591
+ struct ggml_context * ctx,
592
+ struct ggml_tensor * a,
593
+ struct ggml_tensor * b);
594
+
577
595
  GGML_API struct ggml_tensor * ggml_acc(
578
596
  struct ggml_context * ctx,
579
597
  struct ggml_tensor * a,
@@ -597,24 +615,47 @@ extern "C" {
597
615
  struct ggml_tensor * a,
598
616
  struct ggml_tensor * b);
599
617
 
618
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
619
+ struct ggml_context * ctx,
620
+ struct ggml_tensor * a,
621
+ struct ggml_tensor * b);
622
+
600
623
  GGML_API struct ggml_tensor * ggml_mul(
601
624
  struct ggml_context * ctx,
602
625
  struct ggml_tensor * a,
603
626
  struct ggml_tensor * b);
604
627
 
628
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
629
+ struct ggml_context * ctx,
630
+ struct ggml_tensor * a,
631
+ struct ggml_tensor * b);
632
+
605
633
  GGML_API struct ggml_tensor * ggml_div(
606
634
  struct ggml_context * ctx,
607
635
  struct ggml_tensor * a,
608
636
  struct ggml_tensor * b);
609
637
 
638
+ GGML_API struct ggml_tensor * ggml_div_inplace(
639
+ struct ggml_context * ctx,
640
+ struct ggml_tensor * a,
641
+ struct ggml_tensor * b);
642
+
610
643
  GGML_API struct ggml_tensor * ggml_sqr(
611
644
  struct ggml_context * ctx,
612
645
  struct ggml_tensor * a);
613
646
 
647
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
648
+ struct ggml_context * ctx,
649
+ struct ggml_tensor * a);
650
+
614
651
  GGML_API struct ggml_tensor * ggml_sqrt(
615
652
  struct ggml_context * ctx,
616
653
  struct ggml_tensor * a);
617
654
 
655
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
656
+ struct ggml_context * ctx,
657
+ struct ggml_tensor * a);
658
+
618
659
  GGML_API struct ggml_tensor * ggml_log(
619
660
  struct ggml_context * ctx,
620
661
  struct ggml_tensor * a);
@@ -645,35 +686,76 @@ extern "C" {
645
686
  struct ggml_tensor * a,
646
687
  struct ggml_tensor * b);
647
688
 
689
+ GGML_API struct ggml_tensor * ggml_repeat_back(
690
+ struct ggml_context * ctx,
691
+ struct ggml_tensor * a,
692
+ struct ggml_tensor * b);
693
+
648
694
  GGML_API struct ggml_tensor * ggml_abs(
649
695
  struct ggml_context * ctx,
650
696
  struct ggml_tensor * a);
651
697
 
698
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
699
+ struct ggml_context * ctx,
700
+ struct ggml_tensor * a);
701
+
652
702
  GGML_API struct ggml_tensor * ggml_sgn(
653
703
  struct ggml_context * ctx,
654
704
  struct ggml_tensor * a);
655
705
 
706
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
707
+ struct ggml_context * ctx,
708
+ struct ggml_tensor * a);
709
+
656
710
  GGML_API struct ggml_tensor * ggml_neg(
657
711
  struct ggml_context * ctx,
658
712
  struct ggml_tensor * a);
659
713
 
714
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
715
+ struct ggml_context * ctx,
716
+ struct ggml_tensor * a);
717
+
660
718
  GGML_API struct ggml_tensor * ggml_step(
661
719
  struct ggml_context * ctx,
662
720
  struct ggml_tensor * a);
663
721
 
722
+ GGML_API struct ggml_tensor * ggml_step_inplace(
723
+ struct ggml_context * ctx,
724
+ struct ggml_tensor * a);
725
+
664
726
  GGML_API struct ggml_tensor * ggml_relu(
665
727
  struct ggml_context * ctx,
666
728
  struct ggml_tensor * a);
667
729
 
730
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
731
+ struct ggml_context * ctx,
732
+ struct ggml_tensor * a);
733
+
668
734
  // TODO: double-check this computation is correct
669
735
  GGML_API struct ggml_tensor * ggml_gelu(
670
736
  struct ggml_context * ctx,
671
737
  struct ggml_tensor * a);
672
738
 
739
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
740
+ struct ggml_context * ctx,
741
+ struct ggml_tensor * a);
742
+
743
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
744
+ struct ggml_context * ctx,
745
+ struct ggml_tensor * a);
746
+
747
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
748
+ struct ggml_context * ctx,
749
+ struct ggml_tensor * a);
750
+
673
751
  GGML_API struct ggml_tensor * ggml_silu(
674
752
  struct ggml_context * ctx,
675
753
  struct ggml_tensor * a);
676
754
 
755
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
756
+ struct ggml_context * ctx,
757
+ struct ggml_tensor * a);
758
+
677
759
  // a - x
678
760
  // b - dy
679
761
  GGML_API struct ggml_tensor * ggml_silu_back(
@@ -687,10 +769,18 @@ extern "C" {
687
769
  struct ggml_context * ctx,
688
770
  struct ggml_tensor * a);
689
771
 
772
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
773
+ struct ggml_context * ctx,
774
+ struct ggml_tensor * a);
775
+
690
776
  GGML_API struct ggml_tensor * ggml_rms_norm(
691
777
  struct ggml_context * ctx,
692
778
  struct ggml_tensor * a);
693
779
 
780
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
781
+ struct ggml_context * ctx,
782
+ struct ggml_tensor * a);
783
+
694
784
  // a - x
695
785
  // b - dy
696
786
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -698,14 +788,22 @@ extern "C" {
698
788
  struct ggml_tensor * a,
699
789
  struct ggml_tensor * b);
700
790
 
701
- // A: m rows, n columns
702
- // B: p rows, n columns (i.e. we transpose it internally)
791
+ // A: n columns, m rows
792
+ // B: n columns, p rows (i.e. we transpose it internally)
703
793
  // result is m columns, p rows
704
794
  GGML_API struct ggml_tensor * ggml_mul_mat(
705
795
  struct ggml_context * ctx,
706
796
  struct ggml_tensor * a,
707
797
  struct ggml_tensor * b);
708
798
 
799
+ // A: m columns, n rows,
800
+ // B: p columns, n rows,
801
+ // result is m columns, p rows
802
+ GGML_API struct ggml_tensor * ggml_out_prod(
803
+ struct ggml_context * ctx,
804
+ struct ggml_tensor * a,
805
+ struct ggml_tensor * b);
806
+
709
807
  //
710
808
  // operations on tensors without backpropagation
711
809
  //
@@ -916,6 +1014,17 @@ extern "C" {
916
1014
  struct ggml_context * ctx,
917
1015
  struct ggml_tensor * a);
918
1016
 
1017
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
1018
+ struct ggml_context * ctx,
1019
+ struct ggml_tensor * a,
1020
+ struct ggml_tensor * b);
1021
+
1022
+ // in-place, returns view(a)
1023
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
1024
+ struct ggml_context * ctx,
1025
+ struct ggml_tensor * a,
1026
+ struct ggml_tensor * b);
1027
+
919
1028
  // rotary position embedding
920
1029
  // if (mode & 1) != 0, skip n_past elements
921
1030
  // if (mode & 2) != 0, GPT-NeoX style
@@ -961,16 +1070,55 @@ extern "C" {
961
1070
  float min,
962
1071
  float max);
963
1072
 
964
- // padding = 1
1073
+ // TODO: implement general-purpose convolutions
1074
+ // GGML_API struct ggml_tensor * ggml_conv_1d(
1075
+ // struct ggml_context * ctx,
1076
+ // struct ggml_tensor * a,
1077
+ // struct ggml_tensor * b,
1078
+ // int s0,
1079
+ // int p0,
1080
+ // int d0);
1081
+ //
1082
+ // GGML_API struct ggml_tensor * ggml_conv_2d(
1083
+ // struct ggml_context * ctx,
1084
+ // struct ggml_tensor * a,
1085
+ // struct ggml_tensor * b,
1086
+ // int s0,
1087
+ // int s1,
1088
+ // int p0,
1089
+ // int p1,
1090
+ // int d0,
1091
+ // int d1);
1092
+
1093
+ // padding = half
965
1094
  // TODO: we don't support extra parameters for now
966
1095
  // that's why we are hard-coding the stride, padding, and dilation
967
1096
  // not great ..
968
- GGML_API struct ggml_tensor * ggml_conv_1d_1s(
1097
+ // example:
1098
+ // a: 3 80 768 1
1099
+ // b: 3000 80 1 1
1100
+ // res: 3000 768 1 1
1101
+ // used in whisper
1102
+ GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
1103
+ struct ggml_context * ctx,
1104
+ struct ggml_tensor * a,
1105
+ struct ggml_tensor * b);
1106
+
1107
+ // used in whisper
1108
+ GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
969
1109
  struct ggml_context * ctx,
970
1110
  struct ggml_tensor * a,
971
1111
  struct ggml_tensor * b);
972
1112
 
973
- GGML_API struct ggml_tensor * ggml_conv_1d_2s(
1113
+ // kernel size is a->ne[0] x a->ne[1]
1114
+ // stride is equal to kernel size
1115
+ // padding is zero
1116
+ // example:
1117
+ // a: 16 16 3 768
1118
+ // b: 1024 1024 3 1
1119
+ // res: 64 64 768 1
1120
+ // used in sam
1121
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
974
1122
  struct ggml_context * ctx,
975
1123
  struct ggml_tensor * a,
976
1124
  struct ggml_tensor * b);
@@ -982,6 +1130,14 @@ extern "C" {
982
1130
  struct ggml_tensor * v,
983
1131
  bool masked);
984
1132
 
1133
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
1134
+ struct ggml_context * ctx,
1135
+ struct ggml_tensor * q,
1136
+ struct ggml_tensor * k,
1137
+ struct ggml_tensor * v,
1138
+ struct ggml_tensor * d,
1139
+ bool masked);
1140
+
985
1141
  GGML_API struct ggml_tensor * ggml_flash_ff(
986
1142
  struct ggml_context * ctx,
987
1143
  struct ggml_tensor * a,
@@ -990,6 +1146,26 @@ extern "C" {
990
1146
  struct ggml_tensor * c0,
991
1147
  struct ggml_tensor * c1);
992
1148
 
1149
+ // partition into non-overlapping windows with padding if needed
1150
+ // example:
1151
+ // a: 768 64 64 1
1152
+ // w: 14
1153
+ // res: 768 14 14 25
1154
+ // used in sam
1155
+ GGML_API struct ggml_tensor * ggml_win_part(
1156
+ struct ggml_context * ctx,
1157
+ struct ggml_tensor * a,
1158
+ int w);
1159
+
1160
+ // reverse of ggml_win_part
1161
+ // used in sam
1162
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1163
+ struct ggml_context * ctx,
1164
+ struct ggml_tensor * a,
1165
+ int w0,
1166
+ int h0,
1167
+ int w);
1168
+
993
1169
  // Mapping operations
994
1170
  typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
995
1171
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
@@ -1005,6 +1181,19 @@ extern "C" {
1005
1181
  struct ggml_tensor * b,
1006
1182
  ggml_binary_op_f32_t fun);
1007
1183
 
1184
+ // loss function
1185
+
1186
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1187
+ struct ggml_context * ctx,
1188
+ struct ggml_tensor * a,
1189
+ struct ggml_tensor * b);
1190
+
1191
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1192
+ struct ggml_context * ctx,
1193
+ struct ggml_tensor * a,
1194
+ struct ggml_tensor * b,
1195
+ struct ggml_tensor * c);
1196
+
1008
1197
  //
1009
1198
  // automatic differentiation
1010
1199
  //
@@ -1099,6 +1288,8 @@ extern "C" {
1099
1288
  struct {
1100
1289
  int n_iter;
1101
1290
 
1291
+ float sched; // schedule multiplier (fixed, decay or warmup)
1292
+ float decay; // weight decay for AdamW, use 0.0f to disable
1102
1293
  float alpha; // learning rate
1103
1294
  float beta1;
1104
1295
  float beta2;
@@ -1123,6 +1314,49 @@ extern "C" {
1123
1314
  } lbfgs;
1124
1315
  };
1125
1316
 
1317
+ struct ggml_opt_context {
1318
+ struct ggml_context * ctx;
1319
+ struct ggml_opt_params params;
1320
+
1321
+ int iter;
1322
+ int64_t nx; // number of parameter elements
1323
+
1324
+ bool just_initialized;
1325
+
1326
+ struct {
1327
+ struct ggml_tensor * x; // view of the parameters
1328
+ struct ggml_tensor * g1; // gradient
1329
+ struct ggml_tensor * g2; // gradient squared
1330
+ struct ggml_tensor * m; // first moment
1331
+ struct ggml_tensor * v; // second moment
1332
+ struct ggml_tensor * mh; // first moment hat
1333
+ struct ggml_tensor * vh; // second moment hat
1334
+ struct ggml_tensor * pf; // past function values
1335
+ float fx_best;
1336
+ float fx_prev;
1337
+ int n_no_improvement;
1338
+ } adam;
1339
+
1340
+ struct {
1341
+ struct ggml_tensor * x; // current parameters
1342
+ struct ggml_tensor * xp; // previous parameters
1343
+ struct ggml_tensor * g; // current gradient
1344
+ struct ggml_tensor * gp; // previous gradient
1345
+ struct ggml_tensor * d; // search direction
1346
+ struct ggml_tensor * pf; // past function values
1347
+ struct ggml_tensor * lmal; // the L-BFGS memory alpha
1348
+ struct ggml_tensor * lmys; // the L-BFGS memory ys
1349
+ struct ggml_tensor * lms; // the L-BFGS memory s
1350
+ struct ggml_tensor * lmy; // the L-BFGS memory y
1351
+ float fx_best;
1352
+ float step;
1353
+ int j;
1354
+ int k;
1355
+ int end;
1356
+ int n_no_improvement;
1357
+ } lbfgs;
1358
+ };
1359
+
1126
1360
  GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
1127
1361
 
1128
1362
  // optimize the function defined by the tensor f
@@ -1131,6 +1365,27 @@ extern "C" {
1131
1365
  struct ggml_opt_params params,
1132
1366
  struct ggml_tensor * f);
1133
1367
 
1368
+ // initialize optimizer context
1369
+ GGML_API void ggml_opt_init(
1370
+ struct ggml_context * ctx,
1371
+ struct ggml_opt_context * opt,
1372
+ struct ggml_opt_params params,
1373
+ int64_t nx);
1374
+
1375
+ // continue optimizing the function defined by the tensor f
1376
+ GGML_API enum ggml_opt_result ggml_opt_resume(
1377
+ struct ggml_context * ctx,
1378
+ struct ggml_opt_context * opt,
1379
+ struct ggml_tensor * f);
1380
+
1381
+ // continue optimizing the function defined by the tensor f, using the caller-supplied graphs gf and gb
1382
+ GGML_API enum ggml_opt_result ggml_opt_resume_g(
1383
+ struct ggml_context * ctx,
1384
+ struct ggml_opt_context * opt,
1385
+ struct ggml_tensor * f,
1386
+ struct ggml_cgraph * gf,
1387
+ struct ggml_cgraph * gb);
1388
+
1134
1389
  //
1135
1390
  // quantization
1136
1391
  //