llama_cpp 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -190,9 +190,12 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION 2 // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
-#define GGML_MAX_PARAMS 16
+#define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4
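
The two new constants compose: a llama.cpp-style loader folds the quantization version into the on-disk ftype field. A minimal sketch of that convention (illustration only; the arithmetic below is not an API in this header):

    // Assumption: the file stores ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR.
    uint32_t stored      = ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR;
    uint32_t qnt_version = stored / GGML_QNT_VERSION_FACTOR; // quantization format version
    uint32_t base_ftype  = stored % GGML_QNT_VERSION_FACTOR; // original ftype value
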
@@ -231,7 +234,7 @@ extern "C" {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU  = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
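
The removed entry leaves a hole at value 5 rather than renumbering: ggml_ftype values are stable on-disk identifiers, so later entries keep their numbers. A hedged loader-side sketch (hypothetical helper, not part of this header):

    // Value 5 (the retired GGML_FTYPE_MOSTLY_Q4_2) must now be rejected, not remapped.
    static bool ftype_supported(int ftype) {
        return ftype != 5 && ftype >= GGML_FTYPE_ALL_F32 && ftype <= GGML_FTYPE_MOSTLY_Q5_1;
    }
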
@@ -263,12 +270,16 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -291,10 +305,15 @@ extern "C" {
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -322,7 +341,8 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;
 
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
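
Every tensor now records which backend owns its data (the old/new type lines differ only in alignment). A minimal usage sketch, assuming callers set the field directly (ctx is an assumed ggml context; this version exposes no setter):

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    t->backend = GGML_BACKEND_CPU; // default; GGML_BACKEND_CUDA marks GPU-resident data
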
@@ -353,7 +373,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[8]; // TODO: remove and add padding to name?
+        char padding[16];
     };
 
     // computation graph
@@ -497,6 +517,29 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
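
ggml_add1 broadcasts a one-element b over a; ggml_acc adds b into a strided view of a, with nb1..nb3 and offset given in bytes. A hedged usage sketch (ctx assumed):

    // Accumulate a 2x2 block of src into dst starting at row 1, column 1.
    struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * src = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 2);
    size_t offset = 1*dst->nb[1] + 1*dst->nb[0]; // one row down, one column in
    struct ggml_tensor * out = ggml_acc(ctx, dst, src,
            dst->nb[1], dst->nb[2], dst->nb[3], offset);
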
@@ -520,12 +563,24 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
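
ggml_sum_rows resolves the old TODO: it reduces only the first dimension, while ggml_sum still collapses everything to a scalar. Sketch (ctx assumed):

    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 3); // 3 rows of 8
    struct ggml_tensor * s = ggml_sum_rows(ctx, x); // shape [1, 3]: one sum per row
    struct ggml_tensor * t = ggml_sum(ctx, x);      // scalar: sum of all 24 elements
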
@@ -567,6 +622,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +639,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
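
The new *_back functions feed ggml's automatic differentiation: given the forward input x and the upstream gradient dy, they return dx. A hedged sketch (ctx, x, and dy are assumed, with matching shapes):

    struct ggml_tensor * dx_silu = ggml_silu_back(ctx, x, dy);     // a = x, b = dy
    struct ggml_tensor * dx_rms  = ggml_rms_norm_back(ctx, x, dy); // same convention
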
@@ -589,12 +658,66 @@ extern "C" {
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
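
The ggml_set family copies b into a strided view of a; all nb* and offset arguments are in bytes (the "nb2,3" in the source comments reads like a typo for "nb2,nb3"). The _inplace variants return view(a) instead of a modified copy. A hedged sketch of the typical KV-cache write (ctx, k_cache, k_cur, and n_past are assumed):

    // Write this step's K vector into a flat cache at position n_past.
    struct ggml_tensor * k = ggml_set_1d(ctx, k_cache, k_cur,
            (size_t) n_past * k_cur->ne[0] * ggml_element_size(k_cache));
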
@@ -615,6 +738,11 @@ extern "C" {
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -630,6 +758,14 @@ extern "C" {
             int64_t ne1,
             int64_t ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -655,6 +791,18 @@ extern "C" {
             size_t nb2, // slice stride in bytes
             size_t offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            size_t nb1, // row stride in bytes
+            size_t nb2, // slice stride in bytes
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
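
With ggml_reshape_1d/ggml_reshape_4d and ggml_view_4d, the reshape and view families now cover ranks 1 through 4. A hedged sketch of a zero-copy 4-D view over a contiguous tensor a (ctx and ne0..ne3 assumed; strides derived from the element size):

    size_t es = ggml_element_size(a);
    struct ggml_tensor * v = ggml_view_4d(ctx, a, ne0, ne1, ne2, ne3,
            es*ne0,         // nb1: row stride in bytes
            es*ne0*ne1,     // nb2: slice stride in bytes
            es*ne0*ne1*ne2, // nb3
            0);             // offset in bytes
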
@@ -673,20 +821,50 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
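
Together with ggml_soft_max_inplace, the masking helpers cover the usual attention pattern without allocating intermediates. A hedged sketch (ctx, n_past, and the attention-score tensor scores are assumed):

    // Causal attention: mask future positions, then normalize, in place.
    scores = ggml_diag_mask_inf_inplace(ctx, scores, n_past);
    scores = ggml_soft_max_inplace(ctx, scores);
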
@@ -697,13 +875,39 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past,
-            int n_head);
+            int n_head,
+            float bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float min,
+            float max);
 
     // padding = 1
     // TODO: we don't support extra parameters for now
  // TODO: we don't support extra parameters for now
@@ -741,13 +945,13 @@ extern "C" {
741
945
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
742
946
  struct ggml_context * ctx,
743
947
  struct ggml_tensor * a,
744
- const ggml_unary_op_f32_t fun);
948
+ ggml_unary_op_f32_t fun);
745
949
 
746
950
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
747
951
  struct ggml_context * ctx,
748
952
  struct ggml_tensor * a,
749
953
  struct ggml_tensor * b,
750
- const ggml_binary_op_f32_t fun);
954
+ ggml_binary_op_f32_t fun);
751
955
 
752
956
  //
753
957
  // automatic differentiation
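
Dropping const from the function-pointer parameters matches the ggml_unary_op_f32_t / ggml_binary_op_f32_t typedefs (top-level const on a by-value parameter only constrains the callee anyway). A hedged sketch of mapping a custom elementwise op, whose signature per the typedef is (n, dst, src):

    static void square_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) dst[i] = src[i] * src[i];
    }

    struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, square_f32);
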
@@ -876,7 +1080,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
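
The quantize entry points share one shape: n source floats in, block-quantized bytes out, returning the output size. A hedged usage sketch (src, dst, n, and the row width k are assumed; the 16-bucket histogram follows llama.cpp's convention):

    int64_t hist[16] = {0}; // per-value usage histogram
    size_t bytes = ggml_quantize_q5_0(src, dst, n, k, hist);

The ggml.h hunks end here; the hunks below are from the bundled llama utility header (struct llama_file, llama_mmap, llama_mlock, llama_buffer).
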
@@ -101,12 +101,12 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }
 
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
@@ -127,12 +127,12 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
             throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
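
The size -> len renames here (and in llama_mlock and llama_buffer below) appear to remove parameters that shadowed each struct's size member, which previously forced this-> qualification; read_raw and write_raw also become const since they do not modify members. A minimal sketch of the hazard being removed (hypothetical, for illustration):

    struct demo {
        size_t size = 0;
        void set_bad(size_t size) { this->size = size; } // shadowed: this-> required
        void set_good(size_t len) { size = len; }        // unambiguous after the rename
    };
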
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
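
prefetch changes from a bool to a byte count, so callers can ask the kernel to preload just a prefix of the mapping; the default (size_t) -1 keeps the old preload-everything behavior and 0 disables MADV_WILLNEED entirely. Sketch (file is an assumed llama_file):

    llama_mmap whole(&file);                    // default: advise the entire file
    llama_mmap none(&file, 0);                  // no prefetch
    llama_mmap head(&file, 64u * 1024 * 1024);  // preload only the first 64 MiB
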
@@ -267,9 +267,9 @@ struct llama_mlock {
         }
     }
 
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }
 
     void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }
 
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                        len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
 
@@ -363,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
         }
     }
 
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -388,12 +388,12 @@ struct llama_mlock {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }
 
-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
 
@@ -404,10 +404,10 @@ struct llama_buffer {
 
     llama_buffer() = default;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }
 
     ~llama_buffer() {
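
Note that resize discards the old contents (delete[] followed by a fresh new[]): llama_buffer is an allocation helper, not a data-preserving resize like std::vector. Sketch:

    llama_buffer buf;
    buf.resize(1024); // fresh, uninitialized 1024-byte block
    buf.resize(2048); // reallocates; the old bytes are NOT carried over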