llama_cpp 0.0.7 → 0.1.1

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as published.
@@ -190,13 +190,24 @@
  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
  #define GGML_FILE_VERSION 1

+ #define GGML_QNT_VERSION 1 // bump this on quantization format changes
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
  #define GGML_MAX_DIMS 4
  #define GGML_MAX_NODES 4096
- #define GGML_MAX_PARAMS 16
+ #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_OPT 4
  #define GGML_DEFAULT_N_THREADS 4

+ #define GGML_ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+ abort(); \
+ } \
+ } while (0)
+
  #ifdef __cplusplus
  extern "C" {
  #endif
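The two new GGML_QNT_VERSION constants are meant to be folded into the integer file-type field of a model header so loaders can detect quantization-format changes. A minimal sketch of that encoding, assuming a loader that keeps the raw header field in a single integer (the helper names are illustrative, not part of the API):

#include <stdint.h>
#include "ggml.h"

// Writer side: embed the current quantization version in the file-type field.
static uint32_t pack_ftype(uint32_t ftype) {
    return ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR;
}

// Reader side: split the stored field back into version and file type.
static void unpack_ftype(uint32_t stored, uint32_t * qnt_version, uint32_t * ftype) {
    *qnt_version = stored / GGML_QNT_VERSION_FACTOR;
    *ftype       = stored % GGML_QNT_VERSION_FACTOR;
}

GGML_ASSERT itself needs no example: it prints the file, line, and failed expression to stderr and calls abort().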
@@ -212,6 +223,9 @@ extern "C" {
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
  struct ggml_object;
  struct ggml_context;

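The new row-conversion helpers convert whole buffers in one call instead of looping over the scalar converters above them. A small round-trip sketch (the buffer size is arbitrary):

#include "ggml.h"

void fp16_roundtrip_example(void) {
    float src[8]  = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
    ggml_fp16_t half[8];
    float back[8];

    ggml_fp32_to_fp16_row(src, half, 8);   // f32 -> f16 for the whole row
    ggml_fp16_to_fp32_row(half, back, 8);  // back again: `back` holds src rounded through half precision
}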
@@ -220,8 +234,8 @@ extern "C" {
  GGML_TYPE_F16 = 1,
  GGML_TYPE_Q4_0 = 2,
  GGML_TYPE_Q4_1 = 3,
- GGML_TYPE_Q4_2 = 4,
- GGML_TYPE_Q4_3 = 5,
+ // GGML_TYPE_Q4_2 = 4, support has been removed
+ // GGML_TYPE_Q4_3 (5) support has been removed
  GGML_TYPE_Q5_0 = 6,
  GGML_TYPE_Q5_1 = 7,
  GGML_TYPE_Q8_0 = 8,
@@ -232,18 +246,40 @@ extern "C" {
  GGML_TYPE_COUNT,
  };

+ enum ggml_backend {
+ GGML_BACKEND_CPU = 0,
+ GGML_BACKEND_CUDA = 1,
+ };
+
+ // model file types
+ enum ggml_ftype {
+ GGML_FTYPE_UNKNOWN = -1,
+ GGML_FTYPE_ALL_F32 = 0,
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ };
+
  // available tensor operations:
  enum ggml_op {
  GGML_OP_NONE = 0,

  GGML_OP_DUP,
  GGML_OP_ADD,
+ GGML_OP_ADD1,
+ GGML_OP_ACC,
  GGML_OP_SUB,
  GGML_OP_MUL,
  GGML_OP_DIV,
  GGML_OP_SQR,
  GGML_OP_SQRT,
+ GGML_OP_LOG,
  GGML_OP_SUM,
+ GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
  GGML_OP_REPEAT,
  GGML_OP_ABS,
@@ -253,12 +289,15 @@ extern "C" {
  GGML_OP_RELU,
  GGML_OP_GELU,
  GGML_OP_SILU,
+ GGML_OP_SILU_BACK,
  GGML_OP_NORM, // normalize
  GGML_OP_RMS_NORM,
+ GGML_OP_RMS_NORM_BACK,

  GGML_OP_MUL_MAT,

  GGML_OP_SCALE,
+ GGML_OP_SET,
  GGML_OP_CPY,
  GGML_OP_CONT,
  GGML_OP_RESHAPE,
@@ -266,9 +305,14 @@ extern "C" {
  GGML_OP_PERMUTE,
  GGML_OP_TRANSPOSE,
  GGML_OP_GET_ROWS,
+ GGML_OP_GET_ROWS_BACK,
+ GGML_OP_DIAG,
  GGML_OP_DIAG_MASK_INF,
+ GGML_OP_DIAG_MASK_ZERO,
  GGML_OP_SOFT_MAX,
  GGML_OP_ROPE,
+ GGML_OP_ROPE_BACK,
+ GGML_OP_ALIBI,
  GGML_OP_CONV_1D_1S,
  GGML_OP_CONV_1D_2S,

@@ -296,7 +340,8 @@ extern "C" {

  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
+ enum ggml_type    type;
+ enum ggml_backend backend;

  int n_dims;
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -324,7 +369,10 @@ extern "C" {
  int64_t perf_time_us;

  void * data;
- char padding[8];
+
+ char name[32];
+
+ char padding[16];
  };

  // computation graph
@@ -384,6 +432,9 @@ extern "C" {

  GGML_API bool ggml_is_quantized(enum ggml_type type);

+ // TODO: temporary until model loading of ggml examples is refactored
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
  // main

  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,6 +495,9 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+ GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
  //
  // operations on tensors with backpropagation
  //
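Together with the new name[32] field on ggml_tensor, these accessors let callers label tensors for debugging and graph dumps. A brief sketch, assuming the three-field ggml_init_params and the ggml_new_tensor_1d / ggml_free functions declared elsewhere in this header:

#include <stdio.h>
#include "ggml.h"

void tensor_naming_example(void) {
    struct ggml_init_params params = {
        /* mem_size   */ 16 * 1024 * 1024,
        /* mem_buffer */ NULL,
        /* no_alloc   */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128);
    ggml_set_name(t, "tok_embeddings");        // copied into t->name
    printf("tensor: %s\n", ggml_get_name(t));  // prints "tok_embeddings"

    ggml_free(ctx);
}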
@@ -462,6 +516,29 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_add1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_acc(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
  GGML_API struct ggml_tensor * ggml_sub(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -485,12 +562,24 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_log(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_log_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return scalar
- // TODO: compute sum along rows
  GGML_API struct ggml_tensor * ggml_sum(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+ GGML_API struct ggml_tensor * ggml_sum_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // mean along rows
  GGML_API struct ggml_tensor * ggml_mean(
  struct ggml_context * ctx,
@@ -532,6 +621,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_silu_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // normalize along rows
  // TODO: eps is hardcoded to 1e-5 for now
  GGML_API struct ggml_tensor * ggml_norm(
@@ -542,6 +638,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // A: m rows, n columns
  // B: p rows, n columns (i.e. we transpose it internally)
  // result is m columns, p rows
@@ -554,12 +657,66 @@ extern "C" {
  // operations on tensors without backpropagation
  //

- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_scale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // b -> view(a,offset,nb1,nb2,3), return modified a
+ GGML_API struct ggml_tensor * ggml_set(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,3), return modified a
+ GGML_API struct ggml_tensor * ggml_set_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+ // b -> view(a,offset,nb1,nb2,3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+
  // a -> b, return view(b)
  GGML_API struct ggml_tensor * ggml_cpy(
  struct ggml_context * ctx,
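The ggml_set* family records a GGML_OP_SET node that writes tensor b into a strided view of a; strides and the offset are in bytes, and as with the other ops in this header the copy only happens when the graph containing the result is computed. A hedged sketch of the 1-D variant (the tensor shapes are assumptions for illustration):

#include "ggml.h"

// Overwrite 32 floats of `dst` starting at element 64 with the contents of `src`.
// The offset is given in bytes, hence the sizeof(float) factor.
struct ggml_tensor * set_slice_example(struct ggml_context * ctx,
                                       struct ggml_tensor * dst,   // F32, at least 96 elements
                                       struct ggml_tensor * src) { // F32, 32 elements
    return ggml_set_1d(ctx, dst, src, 64 * sizeof(float));
}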
@@ -580,6 +737,11 @@ extern "C" {

  // return view(a)
  // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0);
+
  GGML_API struct ggml_tensor * ggml_reshape_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -595,6 +757,14 @@ extern "C" {
  int64_t ne1,
  int64_t ne2);

+ GGML_API struct ggml_tensor * ggml_reshape_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
  // offset in bytes
  GGML_API struct ggml_tensor * ggml_view_1d(
  struct ggml_context * ctx,
@@ -620,6 +790,18 @@ extern "C" {
  size_t nb2, // slice stride in bytes
  size_t offset);

+ GGML_API struct ggml_tensor * ggml_view_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t nb3,
+ size_t offset);
+
  GGML_API struct ggml_tensor * ggml_permute(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -638,20 +820,50 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_get_rows_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c);
+
+ GGML_API struct ggml_tensor * ggml_diag(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // set elements above the diagonal to -INF
- // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_diag_mask_inf(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past);

  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // set elements above the diagonal to 0
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
  GGML_API struct ggml_tensor * ggml_soft_max(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // rotary position embedding
  // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // rotary position embedding
  // if mode & 1 == 1, skip n_past elements
  // if mode & 2 == 1, GPT-NeoX style
  // TODO: avoid creating a new tensor every time
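The new in-place masking and softmax variants mainly avoid allocating intermediates on the hot attention path. A sketch of how they compose for causal attention scores (the scores layout and the n_past argument are assumptions for illustration):

#include "ggml.h"

// scores: attention logits, already scaled; n_past: number of cached positions
struct ggml_tensor * causal_softmax(struct ggml_context * ctx,
                                    struct ggml_tensor * scores,
                                    int n_past) {
    // mask future positions with -INF, then normalize, both reusing scores' storage
    struct ggml_tensor * masked = ggml_diag_mask_inf_inplace(ctx, scores, n_past);
    return ggml_soft_max_inplace(ctx, masked);
}

ggml_diag_mask_zero is the same masking operation with zeros instead of -INF, as its header comment notes.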
@@ -662,6 +874,31 @@ extern "C" {
  int n_dims,
  int mode);

+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // rotary position embedding backward, i.e compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // alibi position embedding
+ // in-place, returns view(a)
+ struct ggml_tensor * ggml_alibi(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_head);
+
  // padding = 1
  // TODO: we don't support extra parameters for now
  // that's why we are hard-coding the stride, padding, and dilation
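ggml_rope_inplace mirrors ggml_rope but rotates the tensor's existing storage, and ggml_rope_back computes dx from dy for the backward pass. A small sketch of the forward in-place call as it might be applied to a query tensor (the shape comment and mode value are assumptions):

#include "ggml.h"

// q: query tensor for the current batch, e.g. [head_dim, n_head, n_tokens]
struct ggml_tensor * rope_query(struct ggml_context * ctx,
                                struct ggml_tensor * q,
                                int n_past,
                                int head_dim) {
    // mode 0: original rotary style; bit 2 of mode selects the GPT-NeoX style per the comment above
    return ggml_rope_inplace(ctx, q, n_past, head_dim, 0);
}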
@@ -692,19 +929,19 @@ extern "C" {
  struct ggml_tensor * c1);

  // Mapping operations
- GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
- GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+ typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

  GGML_API struct ggml_tensor * ggml_map_unary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- const ggml_unary_op_f32_t fun);
+ ggml_unary_op_f32_t fun);

  GGML_API struct ggml_tensor * ggml_map_binary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- const ggml_binary_op_f32_t fun);
+ ggml_binary_op_f32_t fun);

  //
  // automatic differentiation
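Dropping GGML_API from the two typedefs removes an export attribute that has no meaning on a typedef (and that some compilers reject), and dropping const from the function-pointer parameters keeps the declarations consistent with the definitions. Usage is unchanged; a sketch of a custom element-wise op, where the clamping function itself is an illustrative assumption rather than part of the library:

#include "ggml.h"

// Matches the ggml_unary_op_f32_t signature: clamp every element to [0, 1].
static void clamp01(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        float v = src[i];
        dst[i] = v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v);
    }
}

struct ggml_tensor * clamp01_node(struct ggml_context * ctx, struct ggml_tensor * a) {
    return ggml_map_unary_f32(ctx, a, clamp01);
}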
@@ -833,8 +1070,6 @@ extern "C" {

  GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -14,6 +14,7 @@

  #include <string>
  #include <vector>
+ #include <stdexcept>

  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
  llama_file(const char * fname, const char * mode) {
  fp = std::fopen(fname, mode);
  if (fp == NULL) {
- throw format("failed to open %s: %s", fname, std::strerror(errno));
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  }
  seek(0, SEEK_END);
  size = tell();
@@ -107,10 +108,10 @@ struct llama_file {
  errno = 0;
  std::size_t ret = std::fread(ptr, size, 1, fp);
  if (ferror(fp)) {
- throw format("read error: %s", strerror(errno));
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
  }
  if (ret != 1) {
- throw std::string("unexpectedly reached end of file");
+ throw std::runtime_error(std::string("unexpectedly reached end of file"));
  }
  }

@@ -133,7 +134,7 @@ struct llama_file {
  errno = 0;
  size_t ret = std::fwrite(ptr, size, 1, fp);
  if (ret != 1) {
- throw format("write error: %s", strerror(errno));
+ throw std::runtime_error(format("write error: %s", strerror(errno)));
  }
  }

@@ -180,7 +181,7 @@ struct llama_mmap {
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
  if (addr == MAP_FAILED) {
- throw format("mmap failed: %s", strerror(errno));
+ throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
  }

  if (prefetch) {
@@ -207,7 +208,7 @@ struct llama_mmap {
  DWORD error = GetLastError();

  if (hMapping == NULL) {
- throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+ throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
  }

  addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
  CloseHandle(hMapping);

  if (addr == NULL) {
- throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+ throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
  }

  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
  #else
  static constexpr bool SUPPORTED = false;

- llama_mmap(struct llama_file *) {
- throw std::string("mmap not supported");
+ llama_mmap(struct llama_file *, bool prefetch = true) {
+ (void)prefetch;
+ throw std::runtime_error(std::string("mmap not supported"));
  }
  #endif
  };
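Every throw format(...) / throw std::string(...) site in this header now throws std::runtime_error, so callers can catch one exception hierarchy instead of bare strings. A sketch of the caller side under that assumption (the wrapper function is hypothetical):

#include <cstdio>
#include <exception>

// llama_file is the struct defined earlier in this header.
static bool try_open(const char * path) {
    try {
        llama_file file(path, "rb");
        return true;
    } catch (const std::exception & err) {  // also catches std::runtime_error
        std::fprintf(stderr, "error: %s\n", err.what());
        return false;
    }
}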
@@ -382,8 +384,13 @@ struct llama_mlock {
  #else
  static constexpr bool SUPPORTED = false;

- void raw_lock(const void * addr, size_t size) {
+ size_t lock_granularity() {
+ return (size_t) 65536;
+ }
+
+ bool raw_lock(const void * addr, size_t size) {
  fprintf(stderr, "warning: mlock not supported on this system\n");
+ return false;
  }

  void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +402,8 @@ struct llama_buffer {
  uint8_t * addr = NULL;
  size_t size = 0;

+ llama_buffer() = default;
+
  void resize(size_t size) {
  delete[] addr;
  addr = new uint8_t[size];
@@ -404,5 +413,62 @@ struct llama_buffer {
  ~llama_buffer() {
  delete[] addr;
  }
+
+ // disable copy and move
+ llama_buffer(const llama_buffer&) = delete;
+ llama_buffer(llama_buffer&&) = delete;
+ llama_buffer& operator=(const llama_buffer&) = delete;
+ llama_buffer& operator=(llama_buffer&&) = delete;
  };
+
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ struct llama_ctx_buffer {
+ uint8_t * addr = NULL;
+ bool is_cuda;
+ size_t size = 0;
+
+ llama_ctx_buffer() = default;
+
+ void resize(size_t size) {
+ free();
+
+ addr = (uint8_t *) ggml_cuda_host_malloc(size);
+ if (addr) {
+ is_cuda = true;
+ }
+ else {
+ // fall back to pageable memory
+ addr = new uint8_t[size];
+ is_cuda = false;
+ }
+ this->size = size;
+ }
+
+ void free() {
+ if (addr) {
+ if (is_cuda) {
+ ggml_cuda_host_free(addr);
+ }
+ else {
+ delete[] addr;
+ }
+ }
+ addr = NULL;
+ }
+
+ ~llama_ctx_buffer() {
+ free();
+ }
+
+ // disable copy and move
+ llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+ llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+ llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
+ };
+ #else
+ typedef llama_buffer llama_ctx_buffer;
+ #endif
+
  #endif
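llama_ctx_buffer backs context buffers with pinned (page-locked) host memory when cuBLAS is enabled, falling back to a plain heap allocation when ggml_cuda_host_malloc fails, and the typedef keeps non-CUDA builds source-compatible. A short usage sketch:

#include <cstring>

// Assumes this header has been included; llama_ctx_buffer is llama_buffer on non-cuBLAS builds.
static void buffer_example() {
    llama_ctx_buffer buf;
    buf.resize(1024 * 1024);            // pinned via ggml_cuda_host_malloc when available
    std::memset(buf.addr, 0, buf.size); // use like any host allocation
}                                       // destructor frees via ggml_cuda_host_free or delete[]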