llama_cpp 0.0.7 → 0.1.1

@@ -190,13 +190,24 @@
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
  #define GGML_FILE_VERSION 1
 
+ #define GGML_QNT_VERSION 1 // bump this on quantization format changes
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
  #define GGML_MAX_DIMS 4
  #define GGML_MAX_NODES 4096
- #define GGML_MAX_PARAMS 16
+ #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_OPT 4
  #define GGML_DEFAULT_N_THREADS 4
 
+ #define GGML_ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+ abort(); \
+ } \
+ } while (0)
+
  #ifdef __cplusplus
  extern "C" {
  #endif
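The new GGML_ASSERT macro prints the failing expression together with file and line, then aborts. A minimal sketch of how caller code might use it (the helper and tensor names are hypothetical, assuming ggml.h is included):

    // hypothetical helper built on the new macro
    static void check_same_shape(const struct ggml_tensor * a, const struct ggml_tensor * b) {
        GGML_ASSERT(a->n_dims == b->n_dims); // aborts with "GGML_ASSERT: file:line: a->n_dims == b->n_dims"
        for (int i = 0; i < a->n_dims; ++i) {
            GGML_ASSERT(a->ne[i] == b->ne[i]);
        }
    }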
@@ -212,6 +223,9 @@ extern "C" {
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
  struct ggml_object;
  struct ggml_context;
 
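The two new *_row helpers convert whole buffers instead of one scalar at a time. A small sketch, assuming ggml.h is included and the array sizes are illustrative:

    float src[8] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f };
    ggml_fp16_t half[8];
    float back[8];

    ggml_fp32_to_fp16_row(src, half, 8);  // f32 -> f16, 8 elements in one call
    ggml_fp16_to_fp32_row(half, back, 8); // f16 -> f32; back[i] ~= src[i] up to f16 precision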
@@ -220,8 +234,8 @@ extern "C" {
  GGML_TYPE_F16 = 1,
  GGML_TYPE_Q4_0 = 2,
  GGML_TYPE_Q4_1 = 3,
- GGML_TYPE_Q4_2 = 4,
- GGML_TYPE_Q4_3 = 5,
+ // GGML_TYPE_Q4_2 = 4, support has been removed
+ // GGML_TYPE_Q4_3 (5) support has been removed
  GGML_TYPE_Q5_0 = 6,
  GGML_TYPE_Q5_1 = 7,
  GGML_TYPE_Q8_0 = 8,
@@ -232,18 +246,40 @@ extern "C" {
  GGML_TYPE_COUNT,
  };
 
+ enum ggml_backend {
+ GGML_BACKEND_CPU = 0,
+ GGML_BACKEND_CUDA = 1,
+ };
+
+ // model file types
+ enum ggml_ftype {
+ GGML_FTYPE_UNKNOWN = -1,
+ GGML_FTYPE_ALL_F32 = 0,
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ };
+
  // available tensor operations:
  enum ggml_op {
  GGML_OP_NONE = 0,
 
  GGML_OP_DUP,
  GGML_OP_ADD,
+ GGML_OP_ADD1,
+ GGML_OP_ACC,
  GGML_OP_SUB,
  GGML_OP_MUL,
  GGML_OP_DIV,
  GGML_OP_SQR,
  GGML_OP_SQRT,
+ GGML_OP_LOG,
  GGML_OP_SUM,
+ GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
  GGML_OP_REPEAT,
  GGML_OP_ABS,
@@ -253,12 +289,15 @@ extern "C" {
  GGML_OP_RELU,
  GGML_OP_GELU,
  GGML_OP_SILU,
+ GGML_OP_SILU_BACK,
  GGML_OP_NORM, // normalize
  GGML_OP_RMS_NORM,
+ GGML_OP_RMS_NORM_BACK,
 
  GGML_OP_MUL_MAT,
 
  GGML_OP_SCALE,
+ GGML_OP_SET,
  GGML_OP_CPY,
  GGML_OP_CONT,
  GGML_OP_RESHAPE,
@@ -266,9 +305,14 @@ extern "C" {
  GGML_OP_PERMUTE,
  GGML_OP_TRANSPOSE,
  GGML_OP_GET_ROWS,
+ GGML_OP_GET_ROWS_BACK,
+ GGML_OP_DIAG,
  GGML_OP_DIAG_MASK_INF,
+ GGML_OP_DIAG_MASK_ZERO,
  GGML_OP_SOFT_MAX,
  GGML_OP_ROPE,
+ GGML_OP_ROPE_BACK,
+ GGML_OP_ALIBI,
  GGML_OP_CONV_1D_1S,
  GGML_OP_CONV_1D_2S,
 
@@ -296,7 +340,8 @@ extern "C" {
 
  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
+ enum ggml_type type;
+ enum ggml_backend backend;
 
  int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -324,7 +369,10 @@ extern "C" {
  int64_t perf_time_us;
 
  void * data;
- char padding[8];
+
+ char name[32];
+
+ char padding[16];
  };
 
  // computation graph
@@ -384,6 +432,9 @@ extern "C" {
 
  GGML_API bool ggml_is_quantized(enum ggml_type type);
 
+ // TODO: temporary until model loading of ggml examples is refactored
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
  // main
 
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
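ggml_ftype_to_ggml_type maps a model file type to the tensor type used for the quantized weight tensors. A rough sketch of how a loader might use it; assuming ftype was read from the model file header:

    // assumption: ftype was read from the model file header
    enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0;
    enum ggml_type wtype = ggml_ftype_to_ggml_type(ftype); // expected: GGML_TYPE_Q4_0
    // per the "// except 1d tensors" notes above, 1d tensors (norms, biases) stay F32 regardless of wtype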
@@ -444,6 +495,9 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+ GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
  //
  // operations on tensors with backpropagation
  //
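The new name field plus ggml_get_name/ggml_set_name make graph dumps readable. A minimal sketch; the context size, tensor shape, and the assumption that this version's ggml_init_params has { mem_size, mem_buffer, no_alloc } are illustrative:

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // assumption: ggml_init_params is { mem_size, mem_buffer, no_alloc } in this version
        struct ggml_init_params params = { 16u*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 4);
        ggml_set_name(cur, "attn_input");         // stored in the new tensor->name[32] field
        std::printf("%s\n", ggml_get_name(cur));  // prints "attn_input"

        ggml_free(ctx);
        return 0;
    }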
@@ -462,6 +516,29 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ GGML_API struct ggml_tensor * ggml_add1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_acc(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
  GGML_API struct ggml_tensor * ggml_sub(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -485,12 +562,24 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ GGML_API struct ggml_tensor * ggml_log(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_log_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return scalar
- // TODO: compute sum along rows
  GGML_API struct ggml_tensor * ggml_sum(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+ GGML_API struct ggml_tensor * ggml_sum_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // mean along rows
  GGML_API struct ggml_tensor * ggml_mean(
  struct ggml_context * ctx,
@@ -532,6 +621,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_silu_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // normalize along rows
  // TODO: eps is hardcoded to 1e-5 for now
  GGML_API struct ggml_tensor * ggml_norm(
@@ -542,6 +638,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // A: m rows, n columns
  // B: p rows, n columns (i.e. we transpose it internally)
  // result is m columns, p rows
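The mul_mat comment above means both operands share their inner dimension in ne[0]. A small sketch, assuming ctx is an already-initialized ggml_context and the shapes are illustrative:

    // A: 4 rows x 8 cols (ne = {8, 4}),  B: 3 rows x 8 cols (ne = {8, 3})
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 3);

    // C = A * B^T with ne = {4, 3}: "m columns, p rows" in the comment's terms
    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);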
@@ -554,12 +657,66 @@ extern "C" {
  // operations on tensors without backpropagation
  //
 
- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_scale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // b -> view(a,offset,nb1,nb2,3), return modified a
+ GGML_API struct ggml_tensor * ggml_set(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,3), return modified a
+ GGML_API struct ggml_tensor * ggml_set_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+
  // a -> b, return view(b)
  GGML_API struct ggml_tensor * ggml_cpy(
  struct ggml_context * ctx,
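Note on the new ggml_set* family above: b is written into a view of a described by byte strides (nb1/nb2/nb3) and a byte offset; the _1d/_2d variants are convenience forms. A rough sketch overwriting one row of a 2-D tensor, assuming ctx is an already-initialized ggml_context:

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4); // 4 rows of 8 floats
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);    // one replacement row

    // place b at row 2 of a: offset = 2 rows' worth of bytes
    struct ggml_tensor * out = ggml_set_1d(ctx, a, b, 2*a->nb[1]);
    // like every ggml op this is only recorded here; the data is written when the graph is computed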
@@ -580,6 +737,11 @@ extern "C" {
 
  // return view(a)
  // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0);
+
  GGML_API struct ggml_tensor * ggml_reshape_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -595,6 +757,14 @@ extern "C" {
  int64_t ne1,
  int64_t ne2);
 
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
  // offset in bytes
  GGML_API struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
@@ -620,6 +790,18 @@ extern "C" {
  size_t nb2, // slice stride in bytes
  size_t offset);
 
+ GGML_API struct ggml_tensor * ggml_view_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t nb3,
+ size_t offset);
+
  GGML_API struct ggml_tensor * ggml_permute(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -638,20 +820,50 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);
 
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c);
+
+ GGML_API struct ggml_tensor * ggml_diag(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // set elements above the diagonal to -INF
- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past);
 
  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // set elements above the diagonal to 0
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
  GGML_API struct ggml_tensor * ggml_soft_max(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
- // rotary position embedding
  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // rotary position embedding
  // if mode & 1 == 1, skip n_past elements
  // if mode & 2 == 1, GPT-NeoX style
  // TODO: avoid creating a new tensor every time
@@ -662,6 +874,31 @@ extern "C" {
  int n_dims,
  int mode);
 
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // rotary position embedding backward, i.e compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // alibi position embedding
+ // in-place, returns view(a)
+ struct ggml_tensor * ggml_alibi(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_head);
+
  // padding = 1
  // TODO: we don't support extra parameters for now
  // that's why we are hard-coding the stride, padding, and dilation
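ggml_rope_inplace applies rotary position embedding directly to a view of its input, which is how a transformer graph typically treats the Q/K tensors. A rough sketch, assuming ctx is an already-initialized ggml_context and the shapes are illustrative:

    const int n_past = 0, head_dim = 64, n_head = 8, n_tokens = 5;

    // Q laid out as {head_dim, n_head, n_tokens}
    struct ggml_tensor * Q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, n_tokens);

    // mode 0: default rotary mode (see the mode-bit comments above);
    // positions run n_past .. n_past + n_tokens - 1, result is a view of Q
    struct ggml_tensor * Qrot = ggml_rope_inplace(ctx, Q, n_past, head_dim, 0);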
@@ -692,19 +929,19 @@ extern "C" {
  struct ggml_tensor * c1);
 
  // Mapping operations
- GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
- GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+ typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- const ggml_unary_op_f32_t fun);
+ ggml_unary_op_f32_t fun);
 
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- const ggml_binary_op_f32_t fun);
+ ggml_binary_op_f32_t fun);
 
  //
  // automatic differentiation
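The mapping hooks above let user code register an elementwise f32 function as a graph node (the typedefs simply lose their invalid GGML_API qualifier here). A minimal sketch, assuming ctx is an already-initialized ggml_context; the op name is hypothetical:

    // custom elementwise op: dst[i] = src[i] * src[i]
    static void square_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src[i] * src[i];
        }
    }

    // later, while building a graph:
    struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * x2 = ggml_map_unary_f32(ctx, x, square_f32); // evaluated when the graph runs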
@@ -833,8 +1070,6 @@ extern "C" {
 
  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
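These helpers quantize a flat f32 buffer and return the number of bytes written, with hist collecting counts of the quantized values. A rough sketch for Q4_0; the sizes, the row length k, and the 16-bucket histogram are assumptions patterned on the llama.cpp callers:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    const int n = 4096;                 // total elements (multiple of the block size)
    const int k = 4096;                 // row length
    std::vector<float>   src(n, 0.5f);
    std::vector<uint8_t> dst(n);        // Q4_0 uses well under 1 byte/element, so n bytes is ample
    int64_t hist[16] = {0};             // assumption: 16 histogram buckets

    size_t bytes = ggml_quantize_q4_0(src.data(), dst.data(), n, k, hist);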
@@ -14,6 +14,7 @@
 
  #include <string>
  #include <vector>
+ #include <stdexcept>
 
  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
  llama_file(const char * fname, const char * mode) {
  fp = std::fopen(fname, mode);
  if (fp == NULL) {
- throw format("failed to open %s: %s", fname, std::strerror(errno));
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  }
  seek(0, SEEK_END);
  size = tell();
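Throwing std::runtime_error instead of bare std::string means callers can rely on a single catch of std::exception. A sketch of the calling pattern this enables (the file name is illustrative; assumes <cstdio> and this header are included):

    try {
        llama_file file("models/7B/ggml-model-q4_0.bin", "rb");
        // ... read the header, tensors, etc.
    } catch (const std::exception & err) {
        fprintf(stderr, "error: %s\n", err.what()); // works now that errors derive from std::exception
    }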
@@ -107,10 +108,10 @@ struct llama_file {
  errno = 0;
  std::size_t ret = std::fread(ptr, size, 1, fp);
  if (ferror(fp)) {
- throw format("read error: %s", strerror(errno));
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
  }
  if (ret != 1) {
- throw std::string("unexpectedly reached end of file");
+ throw std::runtime_error(std::string("unexpectedly reached end of file"));
  }
  }
 
@@ -133,7 +134,7 @@ struct llama_file {
  errno = 0;
  size_t ret = std::fwrite(ptr, size, 1, fp);
  if (ret != 1) {
- throw format("write error: %s", strerror(errno));
+ throw std::runtime_error(format("write error: %s", strerror(errno)));
  }
  }
 
@@ -180,7 +181,7 @@ struct llama_mmap {
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
  if (addr == MAP_FAILED) {
- throw format("mmap failed: %s", strerror(errno));
+ throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
  }
 
  if (prefetch) {
@@ -207,7 +208,7 @@ struct llama_mmap {
  DWORD error = GetLastError();
 
  if (hMapping == NULL) {
- throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+ throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
  }
 
  addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
  CloseHandle(hMapping);
 
  if (addr == NULL) {
- throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+ throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
  }
 
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
  #else
  static constexpr bool SUPPORTED = false;
 
- llama_mmap(struct llama_file *) {
- throw std::string("mmap not supported");
+ llama_mmap(struct llama_file *, bool prefetch = true) {
+ (void)prefetch;
+ throw std::runtime_error(std::string("mmap not supported"));
  }
  #endif
  };
@@ -382,8 +384,13 @@ struct llama_mlock {
  #else
  static constexpr bool SUPPORTED = false;
 
- void raw_lock(const void * addr, size_t size) {
+ size_t lock_granularity() {
+ return (size_t) 65536;
+ }
+
+ bool raw_lock(const void * addr, size_t size) {
  fprintf(stderr, "warning: mlock not supported on this system\n");
+ return false;
  }
 
  void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +402,8 @@ struct llama_buffer {
  uint8_t * addr = NULL;
  size_t size = 0;
 
+ llama_buffer() = default;
+
  void resize(size_t size) {
  delete[] addr;
  addr = new uint8_t[size];
@@ -404,5 +413,62 @@ struct llama_buffer {
  ~llama_buffer() {
  delete[] addr;
  }
+
+ // disable copy and move
+ llama_buffer(const llama_buffer&) = delete;
+ llama_buffer(llama_buffer&&) = delete;
+ llama_buffer& operator=(const llama_buffer&) = delete;
+ llama_buffer& operator=(llama_buffer&&) = delete;
  };
+
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ struct llama_ctx_buffer {
+ uint8_t * addr = NULL;
+ bool is_cuda;
+ size_t size = 0;
+
+ llama_ctx_buffer() = default;
+
+ void resize(size_t size) {
+ free();
+
+ addr = (uint8_t *) ggml_cuda_host_malloc(size);
+ if (addr) {
+ is_cuda = true;
+ }
+ else {
+ // fall back to pageable memory
+ addr = new uint8_t[size];
+ is_cuda = false;
+ }
+ this->size = size;
+ }
+
+ void free() {
+ if (addr) {
+ if (is_cuda) {
+ ggml_cuda_host_free(addr);
+ }
+ else {
+ delete[] addr;
+ }
+ }
+ addr = NULL;
+ }
+
+ ~llama_ctx_buffer() {
+ free();
+ }
+
+ // disable copy and move
+ llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+ llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+ };
+ #else
+ typedef llama_buffer llama_ctx_buffer;
+ #endif
+
  #endif
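llama_ctx_buffer prefers CUDA pinned (page-locked) host memory when built with GGML_USE_CUBLAS and falls back to plain new[] otherwise or when pinned allocation fails. A usage sketch; the size is illustrative and the ggml_init_params field order is an assumption about this version:

    llama_ctx_buffer buf;
    buf.resize(512u * 1024 * 1024);  // pinned host memory if available, pageable otherwise

    // buf.addr / buf.size would then be handed to ggml as the context's memory,
    // e.g. ggml_init_params { buf.size, buf.addr, /*no_alloc =*/ false }   (assumed field order)

    // memory is released in the destructor; copy and move are deliberately deleted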