llama_cpp 0.1.0 → 0.1.1

This diff shows the contents of the publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -190,9 +190,12 @@
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
  #define GGML_FILE_VERSION 1
 
+ #define GGML_QNT_VERSION 1 // bump this on quantization format changes
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
  #define GGML_MAX_DIMS 4
  #define GGML_MAX_NODES 4096
- #define GGML_MAX_PARAMS 16
+ #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_OPT 4
  #define GGML_DEFAULT_N_THREADS 4
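
Note: the two new macros let a model file carry a quantization-format revision alongside the existing ftype field. A minimal sketch of the packing scheme, consistent with how llama.cpp consumes these macros at this point; the helper names are illustrative, not part of ggml.h:

    #include "ggml.h"

    // Illustrative helpers (not in ggml.h): pack the quantization version
    // into a stored ftype field via the factor, and unpack it on load.
    static int pack_ftype(int ftype) {
        return ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR;
    }
    static int unpack_qnt_version(int stored) {
        return stored / GGML_QNT_VERSION_FACTOR;
    }
    static int unpack_ftype(int stored) {
        return stored % GGML_QNT_VERSION_FACTOR;
    }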
@@ -231,7 +234,7 @@ extern "C" {
  GGML_TYPE_F16 = 1,
  GGML_TYPE_Q4_0 = 2,
  GGML_TYPE_Q4_1 = 3,
- GGML_TYPE_Q4_2 = 4,
+ // GGML_TYPE_Q4_2 = 4, support has been removed
  // GGML_TYPE_Q4_3 (5) support has been removed
  GGML_TYPE_Q5_0 = 6,
  GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
  GGML_TYPE_COUNT,
  };
 
+ enum ggml_backend {
+ GGML_BACKEND_CPU = 0,
+ GGML_BACKEND_CUDA = 1,
+ };
+
  // model file types
  enum ggml_ftype {
  GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
  GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
  GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
 
  GGML_OP_DUP,
  GGML_OP_ADD,
+ GGML_OP_ADD1,
+ GGML_OP_ACC,
  GGML_OP_SUB,
  GGML_OP_MUL,
  GGML_OP_DIV,
  GGML_OP_SQR,
  GGML_OP_SQRT,
+ GGML_OP_LOG,
  GGML_OP_SUM,
+ GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
  GGML_OP_REPEAT,
  GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
  GGML_OP_RELU,
  GGML_OP_GELU,
  GGML_OP_SILU,
+ GGML_OP_SILU_BACK,
  GGML_OP_NORM, // normalize
  GGML_OP_RMS_NORM,
+ GGML_OP_RMS_NORM_BACK,
 
  GGML_OP_MUL_MAT,
 
  GGML_OP_SCALE,
+ GGML_OP_SET,
  GGML_OP_CPY,
  GGML_OP_CONT,
  GGML_OP_RESHAPE,
@@ -291,9 +305,13 @@ extern "C" {
  GGML_OP_PERMUTE,
  GGML_OP_TRANSPOSE,
  GGML_OP_GET_ROWS,
+ GGML_OP_GET_ROWS_BACK,
+ GGML_OP_DIAG,
  GGML_OP_DIAG_MASK_INF,
+ GGML_OP_DIAG_MASK_ZERO,
  GGML_OP_SOFT_MAX,
  GGML_OP_ROPE,
+ GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CONV_1D_1S,
  GGML_OP_CONV_1D_2S,
@@ -322,7 +340,8 @@ extern "C" {
 
  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
+ enum ggml_type type;
+ enum ggml_backend backend;
 
  int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
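
Note: each tensor now records which backend owns its data, paired with the new enum ggml_backend above. A hedged sketch of how offloading code might use it; the shape and the use_cuda flag are illustrative placeholders, and GGML_BACKEND_CPU (value 0) remains the default for freshly created tensors:

    #include <stdbool.h>
    #include "ggml.h"

    // Sketch: mark a weight for the CUDA backend when offloading is enabled.
    static struct ggml_tensor * make_weight(struct ggml_context * ctx, bool use_cuda) {
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);
        w->backend = use_cuda ? GGML_BACKEND_CUDA : GGML_BACKEND_CPU;
        return w;
    }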
@@ -353,7 +372,7 @@ extern "C" {
 
  char name[32];
 
- char padding[8]; // TODO: remove and add padding to name?
+ char padding[16];
  };
 
  // computation graph
@@ -497,6 +516,29 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ GGML_API struct ggml_tensor * ggml_add1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_acc(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
  GGML_API struct ggml_tensor * ggml_sub(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
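
Note: ggml_acc is the accumulating counterpart of the new ggml_set family further down; it adds b into the region of a described by byte strides nb1..nb3 and a byte offset. A hedged sketch, with illustrative shapes and names:

    #include "ggml.h"

    // Sketch: accumulate an 8x1 tensor b into row 2 of an 8x8 tensor a,
    // describing the destination region with a's own byte strides.
    static struct ggml_tensor * acc_row(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 1);
        return ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], 2*a->nb[1]);
    }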
@@ -520,12 +562,24 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ GGML_API struct ggml_tensor * ggml_log(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_log_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return scalar
- // TODO: compute sum along rows
  GGML_API struct ggml_tensor * ggml_sum(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+ GGML_API struct ggml_tensor * ggml_sum_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // mean along rows
  GGML_API struct ggml_tensor * ggml_mean(
  struct ggml_context * ctx,
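
Note: ggml_sum_rows resolves the old TODO on ggml_sum: the full reduction still returns a scalar, while the new op reduces only the first dimension. A small sketch of the documented shape rule:

    #include "ggml.h"

    // Sketch: [4,3] -> [1,3] for sum_rows, while ggml_sum still
    // collapses everything to a single element.
    static void sum_shapes(struct ggml_context * ctx) {
        struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
        struct ggml_tensor * sr = ggml_sum_rows(ctx, x); // ne = [1, 3, 1, 1]
        struct ggml_tensor * s  = ggml_sum(ctx, x);      // ne = [1, 1, 1, 1]
        (void) sr; (void) s;
    }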
@@ -567,6 +621,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_silu_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // normalize along rows
  // TODO: eps is hardcoded to 1e-5 for now
  GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +638,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // A: m rows, n columns
  // B: p rows, n columns (i.e. we transpose it internally)
  // result is m columns, p rows
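
Note: the new *_back entry points share the convention spelled out in their comments: a is the forward input x, b is the incoming gradient dy, and the result is dx. A hedged sketch, with x and dy as placeholders:

    #include "ggml.h"

    // Sketch: given the forward input x and the gradient dy flowing back
    // into the op, each *_back op returns the gradient with respect to x.
    static struct ggml_tensor * backprop_rms_norm(struct ggml_context * ctx,
                                                  struct ggml_tensor * x,
                                                  struct ggml_tensor * dy) {
        return ggml_rms_norm_back(ctx, x, dy); // dx, same shape as x
    }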
@@ -589,12 +657,66 @@ extern "C" {
  // operations on tensors without backpropagation
  //
 
- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_scale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
+ GGML_API struct ggml_tensor * ggml_set(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ // b -> view(a,offset,nb1), return modified a
+ GGML_API struct ggml_tensor * ggml_set_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+ // b -> view(a,offset,nb1), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+
  // a -> b, return view(b)
  GGML_API struct ggml_tensor * ggml_cpy(
  struct ggml_context * ctx,
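
Note: the ggml_set family writes b over a sub-view of a; all offsets and strides are in bytes, so element indices must be scaled by the element size. A minimal sketch using the 1-D variant, with illustrative names and sizes:

    #include "ggml.h"

    // Sketch: overwrite elements 8..11 of a 16-element vector with b.
    // The offset parameter is in bytes, hence the sizeof(float) scaling.
    static struct ggml_tensor * set_slice(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        return ggml_set_1d(ctx, a, b, 8*sizeof(float));
    }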
@@ -615,6 +737,11 @@ extern "C" {
 
  // return view(a)
  // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0);
+
  GGML_API struct ggml_tensor * ggml_reshape_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -630,6 +757,14 @@ extern "C" {
  int64_t ne1,
  int64_t ne2);
 
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
  // offset in bytes
  GGML_API struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
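
Note: the new 1-D and 4-D reshapes round out the existing 2-D/3-D pair; as before, the element count must match and the result is a view of a. A quick sketch:

    #include "ggml.h"

    // Sketch: reshape a 24-element vector into a 2x3x4x1 view (2*3*4*1 == 24).
    static struct ggml_tensor * reshape_demo(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 24);
        return ggml_reshape_4d(ctx, a, 2, 3, 4, 1);
    }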
@@ -655,6 +790,18 @@ extern "C" {
  size_t nb2, // slice stride in bytes
  size_t offset);
 
+ GGML_API struct ggml_tensor * ggml_view_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t nb3,
+ size_t offset);
+
  GGML_API struct ggml_tensor * ggml_permute(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
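
Note: ggml_view_4d extends the 2-D/3-D viewing pattern with one more extent/stride pair; strides and the offset stay byte-denominated. A hedged sketch taking a dense 4-D view:

    #include "ggml.h"

    // Sketch: a full, dense 4-D view over an existing tensor, reusing the
    // source tensor's own byte strides and a zero byte offset.
    static struct ggml_tensor * view_demo(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 3, 4, 1);
        return ggml_view_4d(ctx, a, 2, 3, 4, 1,
                            a->nb[1], a->nb[2], a->nb[3], 0);
    }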
@@ -673,20 +820,50 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c);
+
+ GGML_API struct ggml_tensor * ggml_diag(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // set elements above the diagonal to -INF
- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past);
 
  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // set elements above the diagonal to 0
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
  GGML_API struct ggml_tensor * ggml_soft_max(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
- // rotary position embedding
  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // rotary position embedding
  // if mode & 1 == 1, skip n_past elements
  // if mode & 2 == 1, GPT-NeoX style
  // TODO: avoid creating a new tensor every time
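
Note: the masking and softmax additions line up into the usual attention pattern: mask future positions, then normalize. A hedged sketch of that pairing; scores and n_past are placeholders:

    #include "ggml.h"

    // Sketch: causal attention masking. Writing -INF above the diagonal
    // makes those positions vanish under softmax; the _zero variant suits
    // cases where masked entries should simply read as 0.
    static struct ggml_tensor * causal_softmax(struct ggml_context * ctx,
                                               struct ggml_tensor * scores,
                                               int n_past) {
        struct ggml_tensor * masked = ggml_diag_mask_inf_inplace(ctx, scores, n_past);
        return ggml_soft_max_inplace(ctx, masked);
    }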
@@ -697,6 +874,23 @@ extern "C" {
  int n_dims,
  int mode);
 
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // rotary position embedding backward, i.e. compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
  // alibi position embedding
  // in-place, returns view(a)
  struct ggml_tensor * ggml_alibi(
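
Note: ggml_rope_inplace mirrors the existing op but mutates its view, and ggml_rope_back is its gradient. A short sketch of the in-place form; qcur, n_past, and n_rot are placeholders, and mode 0 is the original rotation style per the comments above:

    #include "ggml.h"

    // Sketch: apply rotary position embedding to a query tensor in place.
    static struct ggml_tensor * rope_q(struct ggml_context * ctx,
                                       struct ggml_tensor * qcur,
                                       int n_past, int n_rot) {
        return ggml_rope_inplace(ctx, qcur, n_past, n_rot, /*mode=*/0);
    }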
@@ -741,13 +935,13 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- const ggml_unary_op_f32_t fun);
+ ggml_unary_op_f32_t fun);
 
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- const ggml_binary_op_f32_t fun);
+ ggml_binary_op_f32_t fun);
 
  //
  // automatic differentiation
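
Note: dropping the const qualifier on the function-pointer parameters is a signature cleanup; callers pass a plain function pointer either way. A sketch with an illustrative elementwise op, assuming the header's ggml_unary_op_f32_t typedef of (const int, float *, const float *):

    #include "ggml.h"

    // Illustrative user op: n elements in, squared elements out.
    static void square_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src[i] * src[i];
        }
    }

    static struct ggml_tensor * map_demo(struct ggml_context * ctx,
                                         struct ggml_tensor * x) {
        return ggml_map_unary_f32(ctx, x, square_f32);
    }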
@@ -876,7 +1070,6 @@ extern "C" {
 
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
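
Note: with q4_2 gone, the remaining quantize helpers share one shape: n source floats, row length k, and a histogram of per-bucket counts. A hedged sketch; the caller-owned buffers are assumed, and the 16-entry hist size matches how llama.cpp sizes it, not something this header mandates:

    #include <stdint.h>
    #include "ggml.h"

    // Sketch: quantize n floats (row length k) to Q5_0 and return the
    // number of bytes written; hist collects per-bucket counts.
    static size_t quantize_demo(const float * src, void * dst, int n, int k) {
        int64_t hist[16] = {0};
        return ggml_quantize_q5_0(src, dst, n, k, hist);
    }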