llama_cpp 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -198,7 +198,7 @@
198
198
  #define GGML_MAX_PARAMS 256
199
199
  #define GGML_MAX_CONTEXTS 64
200
200
  #define GGML_MAX_OPT 4
201
- #define GGML_MAX_NAME 32
201
+ #define GGML_MAX_NAME 48
202
202
  #define GGML_DEFAULT_N_THREADS 4
203
203
 
204
204
  #define GGML_ASSERT(x) \
@@ -303,6 +303,7 @@ extern "C" {
303
303
  GGML_OP_STEP,
304
304
  GGML_OP_RELU,
305
305
  GGML_OP_GELU,
306
+ GGML_OP_GELU_QUICK,
306
307
  GGML_OP_SILU,
307
308
  GGML_OP_SILU_BACK,
308
309
  GGML_OP_NORM, // normalize
@@ -331,16 +332,23 @@ extern "C" {
331
332
  GGML_OP_ROPE_BACK,
332
333
  GGML_OP_ALIBI,
333
334
  GGML_OP_CLAMP,
334
- GGML_OP_CONV_1D_1S,
335
- GGML_OP_CONV_1D_2S,
335
+ GGML_OP_CONV_1D_S1_PH,
336
+ GGML_OP_CONV_1D_S2_PH,
337
+ GGML_OP_CONV_2D_SK_P0,
336
338
 
337
339
  GGML_OP_FLASH_ATTN,
338
340
  GGML_OP_FLASH_FF,
339
341
  GGML_OP_FLASH_ATTN_BACK,
342
+ GGML_OP_WIN_PART,
343
+ GGML_OP_WIN_UNPART,
340
344
 
341
345
  GGML_OP_MAP_UNARY,
342
346
  GGML_OP_MAP_BINARY,
343
347
 
348
+ GGML_OP_MAP_CUSTOM1,
349
+ GGML_OP_MAP_CUSTOM2,
350
+ GGML_OP_MAP_CUSTOM3,
351
+
344
352
  GGML_OP_CROSS_ENTROPY_LOSS,
345
353
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,
346
354
 
@@ -461,6 +469,9 @@ extern "C" {
461
469
  GGML_API int64_t ggml_cycles(void);
462
470
  GGML_API int64_t ggml_cycles_per_ms(void);
463
471
 
472
+ GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
473
+ GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
474
+
464
475
  GGML_API void ggml_print_object (const struct ggml_object * obj);
465
476
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
466
477
 
@@ -500,8 +511,9 @@ extern "C" {
500
511
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
501
512
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
502
513
 
503
- GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
504
- GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
514
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
515
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
516
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
505
517
 
506
518
  GGML_API struct ggml_tensor * ggml_new_tensor(
507
519
  struct ggml_context * ctx,
@@ -556,8 +568,9 @@ extern "C" {
556
568
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
557
569
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
558
570
 
559
- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
560
- GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
571
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
572
+ GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
573
+ GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
561
574
 
562
575
  //
563
576
  // operations on tensors with backpropagation
@@ -610,24 +623,47 @@ extern "C" {
610
623
  struct ggml_tensor * a,
611
624
  struct ggml_tensor * b);
612
625
 
626
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
627
+ struct ggml_context * ctx,
628
+ struct ggml_tensor * a,
629
+ struct ggml_tensor * b);
630
+
613
631
  GGML_API struct ggml_tensor * ggml_mul(
614
632
  struct ggml_context * ctx,
615
633
  struct ggml_tensor * a,
616
634
  struct ggml_tensor * b);
617
635
 
636
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
637
+ struct ggml_context * ctx,
638
+ struct ggml_tensor * a,
639
+ struct ggml_tensor * b);
640
+
618
641
  GGML_API struct ggml_tensor * ggml_div(
619
642
  struct ggml_context * ctx,
620
643
  struct ggml_tensor * a,
621
644
  struct ggml_tensor * b);
622
645
 
646
+ GGML_API struct ggml_tensor * ggml_div_inplace(
647
+ struct ggml_context * ctx,
648
+ struct ggml_tensor * a,
649
+ struct ggml_tensor * b);
650
+
623
651
  GGML_API struct ggml_tensor * ggml_sqr(
624
652
  struct ggml_context * ctx,
625
653
  struct ggml_tensor * a);
626
654
 
655
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
656
+ struct ggml_context * ctx,
657
+ struct ggml_tensor * a);
658
+
627
659
  GGML_API struct ggml_tensor * ggml_sqrt(
628
660
  struct ggml_context * ctx,
629
661
  struct ggml_tensor * a);
630
662
 
663
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
664
+ struct ggml_context * ctx,
665
+ struct ggml_tensor * a);
666
+
631
667
  GGML_API struct ggml_tensor * ggml_log(
632
668
  struct ggml_context * ctx,
633
669
  struct ggml_tensor * a);
@@ -667,31 +703,67 @@ extern "C" {
667
703
  struct ggml_context * ctx,
668
704
  struct ggml_tensor * a);
669
705
 
706
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
707
+ struct ggml_context * ctx,
708
+ struct ggml_tensor * a);
709
+
670
710
  GGML_API struct ggml_tensor * ggml_sgn(
671
711
  struct ggml_context * ctx,
672
712
  struct ggml_tensor * a);
673
713
 
714
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
715
+ struct ggml_context * ctx,
716
+ struct ggml_tensor * a);
717
+
674
718
  GGML_API struct ggml_tensor * ggml_neg(
675
719
  struct ggml_context * ctx,
676
720
  struct ggml_tensor * a);
677
721
 
722
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
723
+ struct ggml_context * ctx,
724
+ struct ggml_tensor * a);
725
+
678
726
  GGML_API struct ggml_tensor * ggml_step(
679
727
  struct ggml_context * ctx,
680
728
  struct ggml_tensor * a);
681
729
 
730
+ GGML_API struct ggml_tensor * ggml_step_inplace(
731
+ struct ggml_context * ctx,
732
+ struct ggml_tensor * a);
733
+
682
734
  GGML_API struct ggml_tensor * ggml_relu(
683
735
  struct ggml_context * ctx,
684
736
  struct ggml_tensor * a);
685
737
 
738
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
739
+ struct ggml_context * ctx,
740
+ struct ggml_tensor * a);
741
+
686
742
  // TODO: double-check this computation is correct
687
743
  GGML_API struct ggml_tensor * ggml_gelu(
688
744
  struct ggml_context * ctx,
689
745
  struct ggml_tensor * a);
690
746
 
747
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
748
+ struct ggml_context * ctx,
749
+ struct ggml_tensor * a);
750
+
751
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
752
+ struct ggml_context * ctx,
753
+ struct ggml_tensor * a);
754
+
755
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
756
+ struct ggml_context * ctx,
757
+ struct ggml_tensor * a);
758
+
691
759
  GGML_API struct ggml_tensor * ggml_silu(
692
760
  struct ggml_context * ctx,
693
761
  struct ggml_tensor * a);
694
762
 
763
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
764
+ struct ggml_context * ctx,
765
+ struct ggml_tensor * a);
766
+
695
767
  // a - x
696
768
  // b - dy
697
769
  GGML_API struct ggml_tensor * ggml_silu_back(
@@ -705,10 +777,18 @@ extern "C" {
705
777
  struct ggml_context * ctx,
706
778
  struct ggml_tensor * a);
707
779
 
780
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
781
+ struct ggml_context * ctx,
782
+ struct ggml_tensor * a);
783
+
708
784
  GGML_API struct ggml_tensor * ggml_rms_norm(
709
785
  struct ggml_context * ctx,
710
786
  struct ggml_tensor * a);
711
787
 
788
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
789
+ struct ggml_context * ctx,
790
+ struct ggml_tensor * a);
791
+
712
792
  // a - x
713
793
  // b - dy
714
794
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -956,13 +1036,15 @@ extern "C" {
956
1036
  // rotary position embedding
957
1037
  // if mode & 1 == 1, skip n_past elements
958
1038
  // if mode & 2 == 1, GPT-NeoX style
1039
+ // if mode & 4 == 1, ChatGLM style
959
1040
  // TODO: avoid creating a new tensor every time
960
1041
  GGML_API struct ggml_tensor * ggml_rope(
961
1042
  struct ggml_context * ctx,
962
1043
  struct ggml_tensor * a,
963
1044
  int n_past,
964
1045
  int n_dims,
965
- int mode);
1046
+ int mode,
1047
+ int n_ctx);
966
1048
 
967
1049
  // in-place, returns view(a)
968
1050
  GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -970,7 +1052,8 @@ extern "C" {
970
1052
  struct ggml_tensor * a,
971
1053
  int n_past,
972
1054
  int n_dims,
973
- int mode);
1055
+ int mode,
1056
+ int n_ctx);
974
1057
 
975
1058
  // rotary position embedding backward, i.e compute dx from dy
976
1059
  // a - dy
@@ -998,16 +1081,55 @@ extern "C" {
998
1081
  float min,
999
1082
  float max);
1000
1083
 
1001
- // padding = 1
1084
+ // TODO: implement general-purpose convolutions
1085
+ // GGML_API struct ggml_tensor * ggml_conv_1d(
1086
+ // struct ggml_context * ctx,
1087
+ // struct ggml_tensor * a,
1088
+ // struct ggml_tensor * b,
1089
+ // int s0
1090
+ // int p0,
1091
+ // int d0);
1092
+ //
1093
+ // GGML_API struct ggml_tensor * ggml_conv_2d(
1094
+ // struct ggml_context * ctx,
1095
+ // struct ggml_tensor * a,
1096
+ // struct ggml_tensor * b,
1097
+ // int s0,
1098
+ // int s1,
1099
+ // int p0,
1100
+ // int p1,
1101
+ // int d0,
1102
+ // int d1);
1103
+
1104
+ // padding = half
1002
1105
  // TODO: we don't support extra parameters for now
1003
1106
  // that's why we are hard-coding the stride, padding, and dilation
1004
1107
  // not great ..
1005
- GGML_API struct ggml_tensor * ggml_conv_1d_1s(
1108
+ // example:
1109
+ // a: 3 80 768 1
1110
+ // b: 3000 80 1 1
1111
+ // res: 3000 768 1 1
1112
+ // used in whisper
1113
+ GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
1006
1114
  struct ggml_context * ctx,
1007
1115
  struct ggml_tensor * a,
1008
1116
  struct ggml_tensor * b);
1009
1117
 
1010
- GGML_API struct ggml_tensor * ggml_conv_1d_2s(
1118
+ // used in whisper
1119
+ GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
1120
+ struct ggml_context * ctx,
1121
+ struct ggml_tensor * a,
1122
+ struct ggml_tensor * b);
1123
+
1124
+ // kernel size is a->ne[0] x a->ne[1]
1125
+ // stride is equal to kernel size
1126
+ // padding is zero
1127
+ // example:
1128
+ // a: 16 16 3 768
1129
+ // b: 1024 1024 3 1
1130
+ // res: 64 64 768 1
1131
+ // used in sam
1132
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1011
1133
  struct ggml_context * ctx,
1012
1134
  struct ggml_tensor * a,
1013
1135
  struct ggml_tensor * b);
@@ -1035,21 +1157,93 @@ extern "C" {
1035
1157
  struct ggml_tensor * c0,
1036
1158
  struct ggml_tensor * c1);
1037
1159
 
1038
- // Mapping operations
1039
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
1160
+ // partition into non-overlapping windows with padding if needed
1161
+ // example:
1162
+ // a: 768 64 64 1
1163
+ // w: 14
1164
+ // res: 768 14 14 25
1165
+ // used in sam
1166
+ GGML_API struct ggml_tensor * ggml_win_part(
1167
+ struct ggml_context * ctx,
1168
+ struct ggml_tensor * a,
1169
+ int w);
1170
+
1171
+ // reverse of ggml_win_part
1172
+ // used in sam
1173
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1174
+ struct ggml_context * ctx,
1175
+ struct ggml_tensor * a,
1176
+ int w0,
1177
+ int h0,
1178
+ int w);
1179
+
1180
+ // custom operators
1181
+
1182
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1040
1183
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1041
1184
 
1185
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1186
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1187
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1188
+
1042
1189
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
1043
1190
  struct ggml_context * ctx,
1044
1191
  struct ggml_tensor * a,
1045
1192
  ggml_unary_op_f32_t fun);
1046
1193
 
1194
+ GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1195
+ struct ggml_context * ctx,
1196
+ struct ggml_tensor * a,
1197
+ ggml_unary_op_f32_t fun);
1198
+
1047
1199
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
1048
1200
  struct ggml_context * ctx,
1049
1201
  struct ggml_tensor * a,
1050
1202
  struct ggml_tensor * b,
1051
1203
  ggml_binary_op_f32_t fun);
1052
1204
 
1205
+ GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1206
+ struct ggml_context * ctx,
1207
+ struct ggml_tensor * a,
1208
+ struct ggml_tensor * b,
1209
+ ggml_binary_op_f32_t fun);
1210
+
1211
+ GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1212
+ struct ggml_context * ctx,
1213
+ struct ggml_tensor * a,
1214
+ ggml_custom1_op_f32_t fun);
1215
+
1216
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1217
+ struct ggml_context * ctx,
1218
+ struct ggml_tensor * a,
1219
+ ggml_custom1_op_f32_t fun);
1220
+
1221
+ GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1222
+ struct ggml_context * ctx,
1223
+ struct ggml_tensor * a,
1224
+ struct ggml_tensor * b,
1225
+ ggml_custom2_op_f32_t fun);
1226
+
1227
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1228
+ struct ggml_context * ctx,
1229
+ struct ggml_tensor * a,
1230
+ struct ggml_tensor * b,
1231
+ ggml_custom2_op_f32_t fun);
1232
+
1233
+ GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1234
+ struct ggml_context * ctx,
1235
+ struct ggml_tensor * a,
1236
+ struct ggml_tensor * b,
1237
+ struct ggml_tensor * c,
1238
+ ggml_custom3_op_f32_t fun);
1239
+
1240
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1241
+ struct ggml_context * ctx,
1242
+ struct ggml_tensor * a,
1243
+ struct ggml_tensor * b,
1244
+ struct ggml_tensor * c,
1245
+ ggml_custom3_op_f32_t fun);
1246
+
1053
1247
  // loss function
1054
1248
 
1055
1249
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss(