llama_cpp 0.1.0 → 0.1.2

@@ -190,9 +190,12 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION 2 // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
-#define GGML_MAX_PARAMS 16
+#define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4
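As an aside on how the two new constants fit together: the quantization version is typically folded into the ftype field of a model header as ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR, so a loader can split both values back out. A minimal sketch (the helper name split_ftype is made up, not part of this diff):

    #include <stdint.h>
    #include "ggml.h"

    // Split a stored ftype into quantization version and base enum ggml_ftype value.
    static void split_ftype(int32_t stored, int32_t * qnt_version, int32_t * ftype) {
        *qnt_version = stored / GGML_QNT_VERSION_FACTOR; // 2 once GGML_QNT_VERSION is 2
        *ftype       = stored % GGML_QNT_VERSION_FACTOR; // plain enum ggml_ftype value
    }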
@@ -231,7 +234,7 @@ extern "C" {
         GGML_TYPE_F16 = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -291,10 +305,15 @@ extern "C" {
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -322,7 +341,8 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;
 
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -353,7 +373,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[8]; // TODO: remove and add padding to name?
+        char padding[16];
     };
 
     // computation graph
@@ -497,6 +517,29 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -520,12 +563,24 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
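The shape comment on ggml_sum_rows is easiest to see with a tiny graph. A minimal sketch, assuming the ggml context/graph API of this vintage (ggml_init, ggml_build_forward, ggml_graph_compute); it is illustrative only:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false }; // 16 MiB work buffer
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2); // shape [3,2]
        ggml_set_f32(a, 1.0f);                                                 // fill with ones

        struct ggml_tensor * s = ggml_sum_rows(ctx, a); // shape [1,2]

        struct ggml_cgraph gf = ggml_build_forward(s);
        ggml_graph_compute(ctx, &gf);

        printf("%.1f %.1f\n", ggml_get_f32_1d(s, 0), ggml_get_f32_1d(s, 1)); // 3.0 3.0
        ggml_free(ctx);
        return 0;
    }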
@@ -567,6 +622,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +639,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
@@ -589,12 +658,66 @@ extern "C" {
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
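The new ggml_set family copies b into a strided view of a, with strides and offset given in bytes. A small sketch using ggml_set_1d (the wrapper name is made up; ctx is an existing ggml_context as in the example above):

    // Overwrite row 1 of a [4,3] tensor with a [4] tensor; a->nb[1] is the byte size of one row.
    static struct ggml_tensor * overwrite_row1(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(a, 0.0f);
        ggml_set_f32(b, 1.0f);
        return ggml_set_1d(ctx, a, b, a->nb[1]); // same shape as a; row 1 is all ones after compute
    }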
@@ -615,6 +738,11 @@ extern "C" {
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -630,6 +758,14 @@ extern "C" {
             int64_t ne1,
             int64_t ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -655,6 +791,18 @@ extern "C" {
             size_t nb2, // slice stride in bytes
             size_t offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            size_t nb1, // row stride in bytes
+            size_t nb2, // slice stride in bytes
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
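With ggml_reshape_4d and ggml_view_4d all GGML_MAX_DIMS dimensions are now reachable directly. A sketch (hypothetical wrapper, assuming a contiguous input):

    // Reinterpret a contiguous 64-element tensor as [2,2,4,4]; per the header comment this is a view.
    static struct ggml_tensor * as_4d(struct ggml_context * ctx, struct ggml_tensor * flat) {
        return ggml_reshape_4d(ctx, flat, 2, 2, 4, 4); // 2*2*4*4 must equal ggml_nelements(flat)
    }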
@@ -673,20 +821,50 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
@@ -697,13 +875,39 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past,
-            int n_head);
+            int n_head,
+            float bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float min,
+            float max);
 
     // padding = 1
     // TODO: we don't support extra parameters for now
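Most of the new *_inplace variants return view(a) and mutate their input, which lets attention-style code chain them without allocating intermediates; ggml_rope_inplace is the analogous in-place form of ggml_rope. A hedged sketch with illustrative names (not code from the gem):

    // Mask out future positions of an attention score tensor kq and apply softmax, in place.
    static struct ggml_tensor * masked_softmax(struct ggml_context * ctx,
                                               struct ggml_tensor * kq,
                                               int n_past) {
        return ggml_soft_max_inplace(ctx, ggml_diag_mask_inf_inplace(ctx, kq, n_past));
    }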
@@ -741,13 +945,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            const ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun);
 
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            const ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun);
 
     //
     // automatic differentiation
@@ -876,7 +1080,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
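The ggml_quantize_* helpers share one calling convention: n is the total number of floats, k the row length (a multiple of the quantization block size), hist a small histogram the function fills in, and the return value is the number of bytes written to dst. A sketch with a made-up wrapper (the 16-bucket histogram mirrors what llama.cpp's own quantizer passes):

    // Quantize a row-major f32 buffer to Q5_0; dst must be large enough for the quantized rows.
    static size_t quantize_q5_0_buffer(const float * src, void * dst, int n, int k) {
        int64_t hist[16] = {0};                          // per-value histogram buckets
        return ggml_quantize_q5_0(src, dst, n, k, hist); // bytes written to dst
    }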
@@ -101,12 +101,12 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }
 
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
@@ -127,12 +127,12 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
             throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
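The mmap constructor now takes a prefetch size in bytes rather than a flag: the default hints the whole mapping, 0 skips the MADV_WILLNEED hint, and anything in between bounds the hint via std::min. A sketch assuming the surrounding llama_file/llama_mmap helpers are in scope (the path and size are illustrative):

    static void map_model(const char * path) {
        llama_file file(path, "rb");            // throws on open failure
        llama_mmap mapping(&file, 1024 * 1024); // hint only the first MiB; pass 0 to skip the hint
        // mapping.addr / mapping.size remain valid for the lifetime of `mapping`.
    }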
@@ -267,9 +267,9 @@ struct llama_mlock {
         }
     }
 
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }
 
     void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }
 
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                        len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
 
@@ -363,7 +363,7 @@ struct llama_mlock {
         // is equal to the number of pages in its minimum working set minus
         // a small overhead."
         // Hopefully a megabyte is enough overhead:
-        size_t increment = size + 1048576;
+        size_t increment = len + 1048576;
         // The minimum must be <= the maximum, so we need to increase both:
         min_ws_size += increment;
         max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
         }
     }
 
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -388,12 +388,12 @@ struct llama_mlock {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }
 
-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
 
@@ -404,10 +404,10 @@ struct llama_buffer {
 
     llama_buffer() = default;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }
 
     ~llama_buffer() {