llama_cpp 0.1.0 → 0.1.1

@@ -190,9 +190,12 @@
  #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
  #define GGML_FILE_VERSION 1

+ #define GGML_QNT_VERSION        1    // bump this on quantization format changes
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
  #define GGML_MAX_DIMS          4
  #define GGML_MAX_NODES         4096
- #define GGML_MAX_PARAMS        16
+ #define GGML_MAX_PARAMS        256
  #define GGML_MAX_CONTEXTS      64
  #define GGML_MAX_OPT           4
  #define GGML_DEFAULT_N_THREADS 4
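
The two new GGML_QNT_VERSION defines let loaders detect quantization-format changes. A minimal sketch of how the factor could fold the quantization version into an existing integer field; the packing scheme is an assumption inferred from the "do not change this" comment, not something this header spells out:

```c
#include <stdint.h>
#include <stdio.h>

#define GGML_QNT_VERSION        1    // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this

int main(void) {
    int32_t ftype = 2; // hypothetical on-disk file-type field, e.g. "mostly Q4_0"

    // writer: fold the quantization version into the field
    int32_t packed = ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR;

    // reader: split them apart again
    int32_t qntvr = packed / GGML_QNT_VERSION_FACTOR; // 1
    ftype         = packed % GGML_QNT_VERSION_FACTOR; // 2

    printf("qnt version = %d, ftype = %d\n", qntvr, ftype);
    return 0;
}
```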
@@ -231,7 +234,7 @@ extern "C" {
      GGML_TYPE_F16  = 1,
      GGML_TYPE_Q4_0 = 2,
      GGML_TYPE_Q4_1 = 3,
-     GGML_TYPE_Q4_2 = 4,
+     // GGML_TYPE_Q4_2 = 4, support has been removed
      // GGML_TYPE_Q4_3 (5) support has been removed
      GGML_TYPE_Q5_0 = 6,
      GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
      GGML_TYPE_COUNT,
  };

+ enum ggml_backend {
+     GGML_BACKEND_CPU  = 0,
+     GGML_BACKEND_CUDA = 1,
+ };
+
  // model file types
  enum ggml_ftype {
      GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
      GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
      GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
      GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-     GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
      GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
      GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
      GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {

      GGML_OP_DUP,
      GGML_OP_ADD,
+     GGML_OP_ADD1,
+     GGML_OP_ACC,
      GGML_OP_SUB,
      GGML_OP_MUL,
      GGML_OP_DIV,
      GGML_OP_SQR,
      GGML_OP_SQRT,
+     GGML_OP_LOG,
      GGML_OP_SUM,
+     GGML_OP_SUM_ROWS,
      GGML_OP_MEAN,
      GGML_OP_REPEAT,
      GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
      GGML_OP_RELU,
      GGML_OP_GELU,
      GGML_OP_SILU,
+     GGML_OP_SILU_BACK,
      GGML_OP_NORM, // normalize
      GGML_OP_RMS_NORM,
+     GGML_OP_RMS_NORM_BACK,

      GGML_OP_MUL_MAT,

      GGML_OP_SCALE,
+     GGML_OP_SET,
      GGML_OP_CPY,
      GGML_OP_CONT,
      GGML_OP_RESHAPE,
@@ -291,9 +305,13 @@ extern "C" {
      GGML_OP_PERMUTE,
      GGML_OP_TRANSPOSE,
      GGML_OP_GET_ROWS,
+     GGML_OP_GET_ROWS_BACK,
+     GGML_OP_DIAG,
      GGML_OP_DIAG_MASK_INF,
+     GGML_OP_DIAG_MASK_ZERO,
      GGML_OP_SOFT_MAX,
      GGML_OP_ROPE,
+     GGML_OP_ROPE_BACK,
      GGML_OP_ALIBI,
      GGML_OP_CONV_1D_1S,
      GGML_OP_CONV_1D_2S,
@@ -322,7 +340,8 @@ extern "C" {

  // n-dimensional tensor
  struct ggml_tensor {
-     enum ggml_type type;
+     enum ggml_type    type;
+     enum ggml_backend backend;

      int n_dims;
      int64_t ne[GGML_MAX_DIMS]; // number of elements
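
Every tensor now carries a backend tag alongside its type. A minimal sketch of caller code branching on it (whether newly created tensors default to GGML_BACKEND_CPU is an assumption here, not something this diff shows):

```c
#include "ggml.h"
#include <stdio.h>

// print where a tensor's data lives, per the new backend field
static void describe(const struct ggml_tensor * t) {
    switch (t->backend) {
        case GGML_BACKEND_CPU:  printf("%s: host memory\n",   t->name); break;
        case GGML_BACKEND_CUDA: printf("%s: device memory\n", t->name); break;
    }
}
```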
@@ -353,7 +372,7 @@ extern "C" {

      char name[32];

-     char padding[8]; // TODO: remove and add padding to name?
+     char padding[16];
  };

  // computation graph
@@ -497,6 +516,29 @@ extern "C" {
          struct ggml_tensor  * a,
          struct ggml_tensor  * b);

+ GGML_API struct ggml_tensor * ggml_add1(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b);
+
+ GGML_API struct ggml_tensor * ggml_acc(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                nb1,
+         size_t                nb2,
+         size_t                nb3,
+         size_t                offset);
+
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                nb1,
+         size_t                nb2,
+         size_t                nb3,
+         size_t                offset);
+
  GGML_API struct ggml_tensor * ggml_sub(
          struct ggml_context * ctx,
          struct ggml_tensor  * a,
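
ggml_add1 and ggml_acc are new. Judging from the signatures, ggml_add1 adds a single value to every element, and ggml_acc accumulates b into the region of a described by the byte strides nb1..nb3 and a byte offset; both readings are assumptions, so this is only a sketch:

```c
#include "ggml.h"

struct ggml_tensor * example(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * s = ggml_new_f32(ctx, 0.5f); // scalar tensor
    struct ggml_tensor * c = ggml_add1(ctx, a, s);    // c = a + 0.5 (assumed broadcast)

    // accumulate a 4x2 patch into c, starting one row in (strides/offset in bytes)
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    return ggml_acc(ctx, c, b, c->nb[1], c->nb[2], c->nb[3], c->nb[1]);
}
```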
@@ -520,12 +562,24 @@ extern "C" {
          struct ggml_context * ctx,
          struct ggml_tensor  * a);

+ GGML_API struct ggml_tensor * ggml_log(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a);
+
+ GGML_API struct ggml_tensor * ggml_log_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a);
+
  // return scalar
- // TODO: compute sum along rows
  GGML_API struct ggml_tensor * ggml_sum(
          struct ggml_context * ctx,
          struct ggml_tensor  * a);

+ // sums along rows: for input shape [a,b,c,d] the result has shape [1,b,c,d]
+ GGML_API struct ggml_tensor * ggml_sum_rows(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a);
+
  // mean along rows
  GGML_API struct ggml_tensor * ggml_mean(
          struct ggml_context * ctx,
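
A short sketch contrasting the two sums, following the shape comment above:

```c
#include "ggml.h"

void sums(struct ggml_context * ctx) {
    struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 2);

    struct ggml_tensor * total = ggml_sum(ctx, x);      // scalar: shape [1]
    struct ggml_tensor * rows  = ggml_sum_rows(ctx, x); // shape [1,4,2]
    (void) total; (void) rows;
}
```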
@@ -567,6 +621,13 @@ extern "C" {
          struct ggml_context * ctx,
          struct ggml_tensor  * a);

+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_silu_back(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b);
+
  // normalize along rows
  // TODO: eps is hardcoded to 1e-5 for now
  GGML_API struct ggml_tensor * ggml_norm(
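
For reference, the gradient ggml_silu_back has to propagate (standard calculus, not stated in the header): with $\sigma$ the logistic sigmoid and $\mathrm{silu}(x) = x\,\sigma(x)$,

$$\frac{d}{dx}\,\mathrm{silu}(x) = \sigma(x)\bigl(1 + x\,(1 - \sigma(x))\bigr), \qquad dx = dy \cdot \frac{d}{dx}\,\mathrm{silu}(x)$$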
@@ -577,6 +638,13 @@ extern "C" {
          struct ggml_context * ctx,
          struct ggml_tensor  * a);

+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b);
+
  // A: m rows, n columns
  // B: p rows, n columns (i.e. we transpose it internally)
  // result is m columns, p rows
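
The ggml_mul_mat comment above is easiest to read with ggml's convention that ne[0] is the innermost (column) dimension; a shape-only sketch:

```c
#include "ggml.h"

struct ggml_tensor * matmul(struct ggml_context * ctx) {
    // A: m=6 rows, n=4 columns -> ne = [4, 6]
    // B: p=3 rows, n=4 columns -> ne = [4, 3]
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 6);
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);

    // result: "m columns, p rows" -> ne = [6, 3]
    return ggml_mul_mat(ctx, A, B);
}
```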
@@ -589,12 +657,66 @@ extern "C" {
  // operations on tensors without backpropagation
  //

- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_scale(
          struct ggml_context * ctx,
          struct ggml_tensor  * a,
          struct ggml_tensor  * b);

+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b);
+
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
+ GGML_API struct ggml_tensor * ggml_set(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                nb1,
+         size_t                nb2,
+         size_t                nb3,
+         size_t                offset);
+
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                nb1,
+         size_t                nb2,
+         size_t                nb3,
+         size_t                offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                offset);
+
+ // b -> view(a,offset,nb1), return modified a
+ GGML_API struct ggml_tensor * ggml_set_2d(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                nb1,
+         size_t                offset);
+
+ // b -> view(a,offset,nb1), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         size_t                nb1,
+         size_t                offset);
+
  // a -> b, return view(b)
  GGML_API struct ggml_tensor * ggml_cpy(
          struct ggml_context * ctx,
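
A sketch of the simplest new setter, ggml_set_1d, writing a small tensor into a region of a larger one (note the offset is in bytes):

```c
#include "ggml.h"

struct ggml_tensor * patch(struct ggml_context * ctx) {
    struct ggml_tensor * big   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * small = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);

    // place the 16 values of `small` at elements 32..47 of `big`
    return ggml_set_1d(ctx, big, small, 32 * ggml_element_size(big));
}
```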
@@ -615,6 +737,11 @@ extern "C" {

  // return view(a)
  // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int64_t               ne0);
+
  GGML_API struct ggml_tensor * ggml_reshape_2d(
          struct ggml_context * ctx,
          struct ggml_tensor  * a,
@@ -630,6 +757,14 @@ extern "C" {
          int64_t               ne1,
          int64_t               ne2);

+ GGML_API struct ggml_tensor * ggml_reshape_4d(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int64_t               ne0,
+         int64_t               ne1,
+         int64_t               ne2,
+         int64_t               ne3);
+
  // offset in bytes
  GGML_API struct ggml_tensor * ggml_view_1d(
          struct ggml_context * ctx,
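
The 1d and 4d reshapes round out the existing 2d/3d pair; the element count must match and the result is a view, per the comment above:

```c
#include "ggml.h"

void reshapes(struct ggml_context * ctx) {
    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 6); // 48 elements
    struct ggml_tensor * v1 = ggml_reshape_1d(ctx, x, 48);
    struct ggml_tensor * v4 = ggml_reshape_4d(ctx, x, 2, 4, 3, 2);          // 2*4*3*2 = 48
    (void) v1; (void) v4;
}
```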
@@ -655,6 +790,18 @@ extern "C" {
          size_t                nb2, // slice stride in bytes
          size_t                offset);

+ GGML_API struct ggml_tensor * ggml_view_4d(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int64_t               ne0,
+         int64_t               ne1,
+         int64_t               ne2,
+         int64_t               ne3,
+         size_t                nb1, // row   stride in bytes
+         size_t                nb2, // slice stride in bytes
+         size_t                nb3,
+         size_t                offset);
+
  GGML_API struct ggml_tensor * ggml_permute(
          struct ggml_context * ctx,
          struct ggml_tensor  * a,
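
A sketch of ggml_view_4d that reuses the source tensor's own byte strides; taking only the first of the two outermost slices leaves nb1..nb3 unchanged:

```c
#include "ggml.h"

struct ggml_tensor * first_slice(struct ggml_context * ctx) {
    struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 4, 3, 2);

    return ggml_view_4d(ctx, x, 8, 4, 3, 1,
                        x->nb[1], x->nb[2], x->nb[3], /*offset=*/0);
}
```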
@@ -673,20 +820,50 @@ extern "C" {
          struct ggml_tensor  * a,
          struct ggml_tensor  * b);

+ GGML_API struct ggml_tensor * ggml_get_rows_back(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         struct ggml_tensor  * b,
+         struct ggml_tensor  * c);
+
+ GGML_API struct ggml_tensor * ggml_diag(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a);
+
  // set elements above the diagonal to -INF
- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
          struct ggml_context * ctx,
          struct ggml_tensor  * a,
          int                   n_past);

  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int                   n_past);
+
+ // set elements above the diagonal to 0
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int                   n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int                   n_past);
+
  GGML_API struct ggml_tensor * ggml_soft_max(
          struct ggml_context * ctx,
          struct ggml_tensor  * a);

- // rotary position embedding
  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a);
+
+ // rotary position embedding
  // if mode & 1 == 1, skip n_past elements
  // if mode & 2 == 1, GPT-NeoX style
  // TODO: avoid creating a new tensor every time
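
The in-place masking and softmax variants compose into the usual causal-attention pattern; that this is their intended use is an assumption here, but the shapes work out:

```c
#include "ggml.h"

// mask future positions, then normalize attention scores in place
struct ggml_tensor * masked_probs(struct ggml_context * ctx,
                                  struct ggml_tensor  * scores,
                                  int                   n_past) {
    scores = ggml_diag_mask_inf_inplace(ctx, scores, n_past);
    return   ggml_soft_max_inplace(ctx, scores);
}
```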
@@ -697,6 +874,23 @@ extern "C" {
          int                   n_dims,
          int                   mode);

+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int                   n_past,
+         int                   n_dims,
+         int                   mode);
+
+ // rotary position embedding backward, i.e. compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+         struct ggml_context * ctx,
+         struct ggml_tensor  * a,
+         int                   n_past,
+         int                   n_dims,
+         int                   mode);
+
  // alibi position embedding
  // in-place, returns view(a)
  struct ggml_tensor * ggml_alibi(
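
A sketch of the in-place RoPE applied to a query tensor; the mode bits follow the comments above (0 here selects the default rotation style with no skipping):

```c
#include "ggml.h"

struct ggml_tensor * rotate_q(struct ggml_context * ctx,
                              struct ggml_tensor  * q, // [head_dim, n_head, n_tokens]
                              int n_past, int head_dim) {
    return ggml_rope_inplace(ctx, q, n_past, head_dim, /*mode=*/0);
}
```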
@@ -741,13 +935,13 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
          struct ggml_context       * ctx,
          struct ggml_tensor        * a,
-         const ggml_unary_op_f32_t   fun);
+         ggml_unary_op_f32_t         fun);

  GGML_API struct ggml_tensor * ggml_map_binary_f32(
          struct ggml_context        * ctx,
          struct ggml_tensor         * a,
          struct ggml_tensor         * b,
-         const ggml_binary_op_f32_t   fun);
+         ggml_binary_op_f32_t         fun);

  //
  // automatic differentiation
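
Dropping the const on the callback parameter matters for callers holding plain function-pointer variables. A sketch of a custom element-wise op through ggml_map_unary_f32, assuming the usual ggml_unary_op_f32_t shape void fun(const int n, float * dst, const float * src):

```c
#include "ggml.h"
#include <math.h>

// softplus(x) = log(1 + exp(x)), applied element-wise
static void op_softplus(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = logf(1.0f + expf(src[i]));
    }
}

struct ggml_tensor * softplus(struct ggml_context * ctx, struct ggml_tensor * x) {
    return ggml_map_unary_f32(ctx, x, op_softplus);
}
```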
@@ -876,7 +1070,6 @@ extern "C" {

  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
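
A hedged sketch of calling one of the remaining quantizers; that n is the total element count, k the row length, and hist a 16-bucket histogram are assumptions from common ggml usage, and the destination buffer is deliberately over-sized:

```c
#include "ggml.h"
#include <stdint.h>
#include <stdlib.h>

size_t quantize_rows(const float * src, int n, int k) {
    void  * dst = malloc((size_t) n * sizeof(float)); // generous upper bound
    int64_t hist[16] = {0};                           // assumed bucket count

    size_t written = ggml_quantize_q4_0(src, dst, n, k, hist);

    free(dst);
    return written; // bytes of quantized data produced
}
```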