llama_cpp 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +153 -21
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-opencl.c +291 -215
- data/ext/llama_cpp/src/ggml.c +4428 -2143
- data/ext/llama_cpp/src/ggml.h +216 -13
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +300 -149
- data/ext/llama_cpp/src/llama.h +38 -25
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +4 -4
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,9 +190,12 @@
|
|
190
190
|
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
191
191
|
#define GGML_FILE_VERSION 1
|
192
192
|
|
193
|
+
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
194
|
+
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
195
|
+
|
193
196
|
#define GGML_MAX_DIMS 4
|
194
197
|
#define GGML_MAX_NODES 4096
|
195
|
-
#define GGML_MAX_PARAMS
|
198
|
+
#define GGML_MAX_PARAMS 256
|
196
199
|
#define GGML_MAX_CONTEXTS 64
|
197
200
|
#define GGML_MAX_OPT 4
|
198
201
|
#define GGML_DEFAULT_N_THREADS 4
|
@@ -231,7 +234,7 @@ extern "C" {
|
|
231
234
|
GGML_TYPE_F16 = 1,
|
232
235
|
GGML_TYPE_Q4_0 = 2,
|
233
236
|
GGML_TYPE_Q4_1 = 3,
|
234
|
-
GGML_TYPE_Q4_2 = 4,
|
237
|
+
// GGML_TYPE_Q4_2 = 4, support has been removed
|
235
238
|
// GGML_TYPE_Q4_3 (5) support has been removed
|
236
239
|
GGML_TYPE_Q5_0 = 6,
|
237
240
|
GGML_TYPE_Q5_1 = 7,
|
@@ -243,6 +246,11 @@ extern "C" {
|
|
243
246
|
GGML_TYPE_COUNT,
|
244
247
|
};
|
245
248
|
|
249
|
+
enum ggml_backend {
|
250
|
+
GGML_BACKEND_CPU = 0,
|
251
|
+
GGML_BACKEND_CUDA = 1,
|
252
|
+
};
|
253
|
+
|
246
254
|
// model file types
|
247
255
|
enum ggml_ftype {
|
248
256
|
GGML_FTYPE_UNKNOWN = -1,
|
@@ -251,7 +259,6 @@ extern "C" {
|
|
251
259
|
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
252
260
|
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
253
261
|
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
254
|
-
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
255
262
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
256
263
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
257
264
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
@@ -263,12 +270,16 @@ extern "C" {
|
|
263
270
|
|
264
271
|
GGML_OP_DUP,
|
265
272
|
GGML_OP_ADD,
|
273
|
+
GGML_OP_ADD1,
|
274
|
+
GGML_OP_ACC,
|
266
275
|
GGML_OP_SUB,
|
267
276
|
GGML_OP_MUL,
|
268
277
|
GGML_OP_DIV,
|
269
278
|
GGML_OP_SQR,
|
270
279
|
GGML_OP_SQRT,
|
280
|
+
GGML_OP_LOG,
|
271
281
|
GGML_OP_SUM,
|
282
|
+
GGML_OP_SUM_ROWS,
|
272
283
|
GGML_OP_MEAN,
|
273
284
|
GGML_OP_REPEAT,
|
274
285
|
GGML_OP_ABS,
|
@@ -278,12 +289,15 @@ extern "C" {
|
|
278
289
|
GGML_OP_RELU,
|
279
290
|
GGML_OP_GELU,
|
280
291
|
GGML_OP_SILU,
|
292
|
+
GGML_OP_SILU_BACK,
|
281
293
|
GGML_OP_NORM, // normalize
|
282
294
|
GGML_OP_RMS_NORM,
|
295
|
+
GGML_OP_RMS_NORM_BACK,
|
283
296
|
|
284
297
|
GGML_OP_MUL_MAT,
|
285
298
|
|
286
299
|
GGML_OP_SCALE,
|
300
|
+
GGML_OP_SET,
|
287
301
|
GGML_OP_CPY,
|
288
302
|
GGML_OP_CONT,
|
289
303
|
GGML_OP_RESHAPE,
|
@@ -291,10 +305,15 @@ extern "C" {
|
|
291
305
|
GGML_OP_PERMUTE,
|
292
306
|
GGML_OP_TRANSPOSE,
|
293
307
|
GGML_OP_GET_ROWS,
|
308
|
+
GGML_OP_GET_ROWS_BACK,
|
309
|
+
GGML_OP_DIAG,
|
294
310
|
GGML_OP_DIAG_MASK_INF,
|
311
|
+
GGML_OP_DIAG_MASK_ZERO,
|
295
312
|
GGML_OP_SOFT_MAX,
|
296
313
|
GGML_OP_ROPE,
|
314
|
+
GGML_OP_ROPE_BACK,
|
297
315
|
GGML_OP_ALIBI,
|
316
|
+
GGML_OP_CLAMP,
|
298
317
|
GGML_OP_CONV_1D_1S,
|
299
318
|
GGML_OP_CONV_1D_2S,
|
300
319
|
|
@@ -322,7 +341,8 @@ extern "C" {
|
|
322
341
|
|
323
342
|
// n-dimensional tensor
|
324
343
|
struct ggml_tensor {
|
325
|
-
enum ggml_type
|
344
|
+
enum ggml_type type;
|
345
|
+
enum ggml_backend backend;
|
326
346
|
|
327
347
|
int n_dims;
|
328
348
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
@@ -353,7 +373,7 @@ extern "C" {
|
|
353
373
|
|
354
374
|
char name[32];
|
355
375
|
|
356
|
-
char padding[
|
376
|
+
char padding[16];
|
357
377
|
};
|
358
378
|
|
359
379
|
// computation graph
|
@@ -497,6 +517,29 @@ extern "C" {
|
|
497
517
|
struct ggml_tensor * a,
|
498
518
|
struct ggml_tensor * b);
|
499
519
|
|
520
|
+
GGML_API struct ggml_tensor * ggml_add1(
|
521
|
+
struct ggml_context * ctx,
|
522
|
+
struct ggml_tensor * a,
|
523
|
+
struct ggml_tensor * b);
|
524
|
+
|
525
|
+
GGML_API struct ggml_tensor * ggml_acc(
|
526
|
+
struct ggml_context * ctx,
|
527
|
+
struct ggml_tensor * a,
|
528
|
+
struct ggml_tensor * b,
|
529
|
+
size_t nb1,
|
530
|
+
size_t nb2,
|
531
|
+
size_t nb3,
|
532
|
+
size_t offset);
|
533
|
+
|
534
|
+
GGML_API struct ggml_tensor * ggml_acc_inplace(
|
535
|
+
struct ggml_context * ctx,
|
536
|
+
struct ggml_tensor * a,
|
537
|
+
struct ggml_tensor * b,
|
538
|
+
size_t nb1,
|
539
|
+
size_t nb2,
|
540
|
+
size_t nb3,
|
541
|
+
size_t offset);
|
542
|
+
|
500
543
|
GGML_API struct ggml_tensor * ggml_sub(
|
501
544
|
struct ggml_context * ctx,
|
502
545
|
struct ggml_tensor * a,
|
@@ -520,12 +563,24 @@ extern "C" {
|
|
520
563
|
struct ggml_context * ctx,
|
521
564
|
struct ggml_tensor * a);
|
522
565
|
|
566
|
+
GGML_API struct ggml_tensor * ggml_log(
|
567
|
+
struct ggml_context * ctx,
|
568
|
+
struct ggml_tensor * a);
|
569
|
+
|
570
|
+
GGML_API struct ggml_tensor * ggml_log_inplace(
|
571
|
+
struct ggml_context * ctx,
|
572
|
+
struct ggml_tensor * a);
|
573
|
+
|
523
574
|
// return scalar
|
524
|
-
// TODO: compute sum along rows
|
525
575
|
GGML_API struct ggml_tensor * ggml_sum(
|
526
576
|
struct ggml_context * ctx,
|
527
577
|
struct ggml_tensor * a);
|
528
578
|
|
579
|
+
// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
|
580
|
+
GGML_API struct ggml_tensor * ggml_sum_rows(
|
581
|
+
struct ggml_context * ctx,
|
582
|
+
struct ggml_tensor * a);
|
583
|
+
|
529
584
|
// mean along rows
|
530
585
|
GGML_API struct ggml_tensor * ggml_mean(
|
531
586
|
struct ggml_context * ctx,
|
@@ -567,6 +622,13 @@ extern "C" {
|
|
567
622
|
struct ggml_context * ctx,
|
568
623
|
struct ggml_tensor * a);
|
569
624
|
|
625
|
+
// a - x
|
626
|
+
// b - dy
|
627
|
+
GGML_API struct ggml_tensor * ggml_silu_back(
|
628
|
+
struct ggml_context * ctx,
|
629
|
+
struct ggml_tensor * a,
|
630
|
+
struct ggml_tensor * b);
|
631
|
+
|
570
632
|
// normalize along rows
|
571
633
|
// TODO: eps is hardcoded to 1e-5 for now
|
572
634
|
GGML_API struct ggml_tensor * ggml_norm(
|
@@ -577,6 +639,13 @@ extern "C" {
|
|
577
639
|
struct ggml_context * ctx,
|
578
640
|
struct ggml_tensor * a);
|
579
641
|
|
642
|
+
// a - x
|
643
|
+
// b - dy
|
644
|
+
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
645
|
+
struct ggml_context * ctx,
|
646
|
+
struct ggml_tensor * a,
|
647
|
+
struct ggml_tensor * b);
|
648
|
+
|
580
649
|
// A: m rows, n columns
|
581
650
|
// B: p rows, n columns (i.e. we transpose it internally)
|
582
651
|
// result is m columns, p rows
|
@@ -589,12 +658,66 @@ extern "C" {
|
|
589
658
|
// operations on tensors without backpropagation
|
590
659
|
//
|
591
660
|
|
592
|
-
// in-place, returns view(a)
|
593
661
|
GGML_API struct ggml_tensor * ggml_scale(
|
594
662
|
struct ggml_context * ctx,
|
595
663
|
struct ggml_tensor * a,
|
596
664
|
struct ggml_tensor * b);
|
597
665
|
|
666
|
+
// in-place, returns view(a)
|
667
|
+
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
668
|
+
struct ggml_context * ctx,
|
669
|
+
struct ggml_tensor * a,
|
670
|
+
struct ggml_tensor * b);
|
671
|
+
|
672
|
+
// b -> view(a,offset,nb1,nb2,nb3), return modified a
|
673
|
+
GGML_API struct ggml_tensor * ggml_set(
|
674
|
+
struct ggml_context * ctx,
|
675
|
+
struct ggml_tensor * a,
|
676
|
+
struct ggml_tensor * b,
|
677
|
+
size_t nb1,
|
678
|
+
size_t nb2,
|
679
|
+
size_t nb3,
|
680
|
+
size_t offset);
|
681
|
+
|
682
|
+
// b -> view(a,offset,nb1,nb2,nb3), return view(a)
|
683
|
+
GGML_API struct ggml_tensor * ggml_set_inplace(
|
684
|
+
struct ggml_context * ctx,
|
685
|
+
struct ggml_tensor * a,
|
686
|
+
struct ggml_tensor * b,
|
687
|
+
size_t nb1,
|
688
|
+
size_t nb2,
|
689
|
+
size_t nb3,
|
690
|
+
size_t offset);
|
691
|
+
|
692
|
+
GGML_API struct ggml_tensor * ggml_set_1d(
|
693
|
+
struct ggml_context * ctx,
|
694
|
+
struct ggml_tensor * a,
|
695
|
+
struct ggml_tensor * b,
|
696
|
+
size_t offset);
|
697
|
+
|
698
|
+
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
699
|
+
struct ggml_context * ctx,
|
700
|
+
struct ggml_tensor * a,
|
701
|
+
struct ggml_tensor * b,
|
702
|
+
size_t offset);
|
703
|
+
|
704
|
+
// b -> view(a,offset,nb1,nb2,3), return modified a
|
705
|
+
GGML_API struct ggml_tensor * ggml_set_2d(
|
706
|
+
struct ggml_context * ctx,
|
707
|
+
struct ggml_tensor * a,
|
708
|
+
struct ggml_tensor * b,
|
709
|
+
size_t nb1,
|
710
|
+
size_t offset);
|
711
|
+
|
712
|
+
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
713
|
+
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
714
|
+
struct ggml_context * ctx,
|
715
|
+
struct ggml_tensor * a,
|
716
|
+
struct ggml_tensor * b,
|
717
|
+
size_t nb1,
|
718
|
+
size_t offset);
|
719
|
+
|
720
|
+
|
598
721
|
// a -> b, return view(b)
|
599
722
|
GGML_API struct ggml_tensor * ggml_cpy(
|
600
723
|
struct ggml_context * ctx,
|
@@ -615,6 +738,11 @@ extern "C" {
|
|
615
738
|
|
616
739
|
// return view(a)
|
617
740
|
// TODO: when we start computing gradient, make a copy instead of view
|
741
|
+
GGML_API struct ggml_tensor * ggml_reshape_1d(
|
742
|
+
struct ggml_context * ctx,
|
743
|
+
struct ggml_tensor * a,
|
744
|
+
int64_t ne0);
|
745
|
+
|
618
746
|
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
619
747
|
struct ggml_context * ctx,
|
620
748
|
struct ggml_tensor * a,
|
@@ -630,6 +758,14 @@ extern "C" {
|
|
630
758
|
int64_t ne1,
|
631
759
|
int64_t ne2);
|
632
760
|
|
761
|
+
GGML_API struct ggml_tensor * ggml_reshape_4d(
|
762
|
+
struct ggml_context * ctx,
|
763
|
+
struct ggml_tensor * a,
|
764
|
+
int64_t ne0,
|
765
|
+
int64_t ne1,
|
766
|
+
int64_t ne2,
|
767
|
+
int64_t ne3);
|
768
|
+
|
633
769
|
// offset in bytes
|
634
770
|
GGML_API struct ggml_tensor * ggml_view_1d(
|
635
771
|
struct ggml_context * ctx,
|
@@ -655,6 +791,18 @@ extern "C" {
|
|
655
791
|
size_t nb2, // slice stride in bytes
|
656
792
|
size_t offset);
|
657
793
|
|
794
|
+
GGML_API struct ggml_tensor * ggml_view_4d(
|
795
|
+
struct ggml_context * ctx,
|
796
|
+
struct ggml_tensor * a,
|
797
|
+
int64_t ne0,
|
798
|
+
int64_t ne1,
|
799
|
+
int64_t ne2,
|
800
|
+
int64_t ne3,
|
801
|
+
size_t nb1, // row stride in bytes
|
802
|
+
size_t nb2, // slice stride in bytes
|
803
|
+
size_t nb3,
|
804
|
+
size_t offset);
|
805
|
+
|
658
806
|
GGML_API struct ggml_tensor * ggml_permute(
|
659
807
|
struct ggml_context * ctx,
|
660
808
|
struct ggml_tensor * a,
|
@@ -673,20 +821,50 @@ extern "C" {
|
|
673
821
|
struct ggml_tensor * a,
|
674
822
|
struct ggml_tensor * b);
|
675
823
|
|
824
|
+
GGML_API struct ggml_tensor * ggml_get_rows_back(
|
825
|
+
struct ggml_context * ctx,
|
826
|
+
struct ggml_tensor * a,
|
827
|
+
struct ggml_tensor * b,
|
828
|
+
struct ggml_tensor * c);
|
829
|
+
|
830
|
+
GGML_API struct ggml_tensor * ggml_diag(
|
831
|
+
struct ggml_context * ctx,
|
832
|
+
struct ggml_tensor * a);
|
833
|
+
|
676
834
|
// set elements above the diagonal to -INF
|
677
|
-
// in-place, returns view(a)
|
678
835
|
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
679
836
|
struct ggml_context * ctx,
|
680
837
|
struct ggml_tensor * a,
|
681
838
|
int n_past);
|
682
839
|
|
683
840
|
// in-place, returns view(a)
|
841
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
|
842
|
+
struct ggml_context * ctx,
|
843
|
+
struct ggml_tensor * a,
|
844
|
+
int n_past);
|
845
|
+
|
846
|
+
// set elements above the diagonal to 0
|
847
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero(
|
848
|
+
struct ggml_context * ctx,
|
849
|
+
struct ggml_tensor * a,
|
850
|
+
int n_past);
|
851
|
+
|
852
|
+
// in-place, returns view(a)
|
853
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
854
|
+
struct ggml_context * ctx,
|
855
|
+
struct ggml_tensor * a,
|
856
|
+
int n_past);
|
857
|
+
|
684
858
|
GGML_API struct ggml_tensor * ggml_soft_max(
|
685
859
|
struct ggml_context * ctx,
|
686
860
|
struct ggml_tensor * a);
|
687
861
|
|
688
|
-
// rotary position embedding
|
689
862
|
// in-place, returns view(a)
|
863
|
+
GGML_API struct ggml_tensor * ggml_soft_max_inplace(
|
864
|
+
struct ggml_context * ctx,
|
865
|
+
struct ggml_tensor * a);
|
866
|
+
|
867
|
+
// rotary position embedding
|
690
868
|
// if mode & 1 == 1, skip n_past elements
|
691
869
|
// if mode & 2 == 2, GPT-NeoX style
|
692
870
|
// TODO: avoid creating a new tensor every time
|
@@ -697,13 +875,39 @@ extern "C" {
|
|
697
875
|
int n_dims,
|
698
876
|
int mode);
|
699
877
|
|
878
|
+
// in-place, returns view(a)
|
879
|
+
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
880
|
+
struct ggml_context * ctx,
|
881
|
+
struct ggml_tensor * a,
|
882
|
+
int n_past,
|
883
|
+
int n_dims,
|
884
|
+
int mode);
|
885
|
+
|
886
|
+
// rotary position embedding backward, i.e. compute dx from dy
|
887
|
+
// a - dy
|
888
|
+
GGML_API struct ggml_tensor * ggml_rope_back(
|
889
|
+
struct ggml_context * ctx,
|
890
|
+
struct ggml_tensor * a,
|
891
|
+
int n_past,
|
892
|
+
int n_dims,
|
893
|
+
int mode);
|
894
|
+
|
700
895
|
// alibi position embedding
|
701
896
|
// in-place, returns view(a)
|
702
897
|
struct ggml_tensor * ggml_alibi(
|
703
898
|
struct ggml_context * ctx,
|
704
899
|
struct ggml_tensor * a,
|
705
900
|
int n_past,
|
706
|
-
int n_head
|
901
|
+
int n_head,
|
902
|
+
float bias_max);
|
903
|
+
|
904
|
+
// clamp
|
905
|
+
// in-place, returns view(a)
|
906
|
+
struct ggml_tensor * ggml_clamp(
|
907
|
+
struct ggml_context * ctx,
|
908
|
+
struct ggml_tensor * a,
|
909
|
+
float min,
|
910
|
+
float max);
|
707
911
|
|
708
912
|
// padding = 1
|
709
913
|
// TODO: we don't support extra parameters for now
|
@@ -741,13 +945,13 @@ extern "C" {
|
|
741
945
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
742
946
|
struct ggml_context * ctx,
|
743
947
|
struct ggml_tensor * a,
|
744
|
-
|
948
|
+
ggml_unary_op_f32_t fun);
|
745
949
|
|
746
950
|
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
747
951
|
struct ggml_context * ctx,
|
748
952
|
struct ggml_tensor * a,
|
749
953
|
struct ggml_tensor * b,
|
750
|
-
|
954
|
+
ggml_binary_op_f32_t fun);
|
751
955
|
|
752
956
|
//
|
753
957
|
// automatic differentiation
|
@@ -876,7 +1080,6 @@ extern "C" {
|
|
876
1080
|
|
877
1081
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
878
1082
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
879
|
-
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
880
1083
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
881
1084
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
882
1085
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -101,12 +101,12 @@ struct llama_file {
|
|
101
101
|
LLAMA_ASSERT(ret == 0); // same
|
102
102
|
}
|
103
103
|
|
104
|
-
void read_raw(void * ptr, size_t
|
105
|
-
if (
|
104
|
+
void read_raw(void * ptr, size_t len) const {
|
105
|
+
if (len == 0) {
|
106
106
|
return;
|
107
107
|
}
|
108
108
|
errno = 0;
|
109
|
-
std::size_t ret = std::fread(ptr,
|
109
|
+
std::size_t ret = std::fread(ptr, len, 1, fp);
|
110
110
|
if (ferror(fp)) {
|
111
111
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
112
112
|
}
|
@@ -127,12 +127,12 @@ struct llama_file {
|
|
127
127
|
return std::string(chars.data(), len);
|
128
128
|
}
|
129
129
|
|
130
|
-
void write_raw(const void * ptr, size_t
|
131
|
-
if (
|
130
|
+
void write_raw(const void * ptr, size_t len) const {
|
131
|
+
if (len == 0) {
|
132
132
|
return;
|
133
133
|
}
|
134
134
|
errno = 0;
|
135
|
-
size_t ret = std::fwrite(ptr,
|
135
|
+
size_t ret = std::fwrite(ptr, len, 1, fp);
|
136
136
|
if (ret != 1) {
|
137
137
|
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
138
138
|
}
|
@@ -172,7 +172,7 @@ struct llama_mmap {
|
|
172
172
|
#ifdef _POSIX_MAPPED_FILES
|
173
173
|
static constexpr bool SUPPORTED = true;
|
174
174
|
|
175
|
-
llama_mmap(struct llama_file * file,
|
175
|
+
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
|
176
176
|
size = file->size;
|
177
177
|
int fd = fileno(file->fp);
|
178
178
|
int flags = MAP_SHARED;
|
@@ -184,9 +184,9 @@ struct llama_mmap {
|
|
184
184
|
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
185
185
|
}
|
186
186
|
|
187
|
-
if (prefetch) {
|
187
|
+
if (prefetch > 0) {
|
188
188
|
// Advise the kernel to preload the mapped memory
|
189
|
-
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
189
|
+
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
190
190
|
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
191
191
|
strerror(errno));
|
192
192
|
}
|
@@ -267,9 +267,9 @@ struct llama_mlock {
|
|
267
267
|
}
|
268
268
|
}
|
269
269
|
|
270
|
-
void init(void *
|
271
|
-
LLAMA_ASSERT(
|
272
|
-
|
270
|
+
void init(void * ptr) {
|
271
|
+
LLAMA_ASSERT(addr == NULL && size == 0);
|
272
|
+
addr = ptr;
|
273
273
|
}
|
274
274
|
|
275
275
|
void grow_to(size_t target_size) {
|
@@ -340,14 +340,14 @@ struct llama_mlock {
|
|
340
340
|
return (size_t) si.dwPageSize;
|
341
341
|
}
|
342
342
|
|
343
|
-
bool raw_lock(void *
|
343
|
+
bool raw_lock(void * ptr, size_t len) {
|
344
344
|
for (int tries = 1; ; tries++) {
|
345
|
-
if (VirtualLock(
|
345
|
+
if (VirtualLock(ptr, len)) {
|
346
346
|
return true;
|
347
347
|
}
|
348
348
|
if (tries == 2) {
|
349
349
|
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
350
|
-
|
350
|
+
len, size, llama_format_win_err(GetLastError()).c_str());
|
351
351
|
return false;
|
352
352
|
}
|
353
353
|
|
@@ -363,7 +363,7 @@ struct llama_mlock {
|
|
363
363
|
// is equal to the number of pages in its minimum working set minus
|
364
364
|
// a small overhead."
|
365
365
|
// Hopefully a megabyte is enough overhead:
|
366
|
-
size_t increment =
|
366
|
+
size_t increment = len + 1048576;
|
367
367
|
// The minimum must be <= the maximum, so we need to increase both:
|
368
368
|
min_ws_size += increment;
|
369
369
|
max_ws_size += increment;
|
@@ -375,8 +375,8 @@ struct llama_mlock {
|
|
375
375
|
}
|
376
376
|
}
|
377
377
|
|
378
|
-
void raw_unlock(void *
|
379
|
-
if (!VirtualUnlock(
|
378
|
+
void raw_unlock(void * ptr, size_t len) {
|
379
|
+
if (!VirtualUnlock(ptr, len)) {
|
380
380
|
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
381
381
|
llama_format_win_err(GetLastError()).c_str());
|
382
382
|
}
|
@@ -388,12 +388,12 @@ struct llama_mlock {
|
|
388
388
|
return (size_t) 65536;
|
389
389
|
}
|
390
390
|
|
391
|
-
bool raw_lock(const void * addr, size_t
|
391
|
+
bool raw_lock(const void * addr, size_t len) {
|
392
392
|
fprintf(stderr, "warning: mlock not supported on this system\n");
|
393
393
|
return false;
|
394
394
|
}
|
395
395
|
|
396
|
-
void raw_unlock(const void * addr, size_t
|
396
|
+
void raw_unlock(const void * addr, size_t len) {}
|
397
397
|
#endif
|
398
398
|
};
|
399
399
|
|
@@ -404,10 +404,10 @@ struct llama_buffer {
|
|
404
404
|
|
405
405
|
llama_buffer() = default;
|
406
406
|
|
407
|
-
void resize(size_t
|
407
|
+
void resize(size_t len) {
|
408
408
|
delete[] addr;
|
409
|
-
addr = new uint8_t[
|
410
|
-
|
409
|
+
addr = new uint8_t[len];
|
410
|
+
size = len;
|
411
411
|
}
|
412
412
|
|
413
413
|
~llama_buffer() {
|