llama_cpp 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +153 -21
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-opencl.c +291 -215
- data/ext/llama_cpp/src/ggml.c +4428 -2143
- data/ext/llama_cpp/src/ggml.h +216 -13
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +300 -149
- data/ext/llama_cpp/src/llama.h +38 -25
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +4 -4
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,9 +190,12 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION        2    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
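Note: the new GGML_QNT_VERSION pair lets loaders detect files quantized under an older tensor format. A minimal sketch of the arithmetic, assuming the encoding llama.cpp adopted alongside this change (the version is folded into the stored ftype via GGML_QNT_VERSION_FACTOR; variable names here are illustrative):

    #include <cstdint>
    #include <cstdio>

    #define GGML_QNT_VERSION        2
    #define GGML_QNT_VERSION_FACTOR 1000

    int main() {
        uint32_t ftype = 2; // e.g. "mostly Q4_0"

        // writer: fold the quantization format version into the stored ftype
        uint32_t stored = ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR;

        // reader: split it back apart and compare against GGML_QNT_VERSION
        uint32_t qnt_version = stored / GGML_QNT_VERSION_FACTOR;
        uint32_t base_ftype  = stored % GGML_QNT_VERSION_FACTOR;

        std::printf("stored=%u qnt_version=%u ftype=%u\n", stored, qnt_version, base_ftype);
        return 0;
    }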
@@ -231,7 +234,7 @@ extern "C" {
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -251,7 +259,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -291,10 +305,15 @@ extern "C" {
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -322,7 +341,8 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;
 
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -353,7 +373,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[
+        char padding[16];
     };
 
     // computation graph
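Note: ggml_tensor gains a backend tag alongside its type. With CUDA offload, t->data may point at device memory, so host code should check the tag before dereferencing. A hedged sketch (the helper below is illustrative, not part of ggml):

    #include "ggml.h"
    #include <cstdio>

    // Illustrative guard: only read t->data when the tensor lives on the CPU backend.
    static float first_f32_or_zero(const struct ggml_tensor * t) {
        if (t->backend != GGML_BACKEND_CPU) {
            std::fprintf(stderr, "tensor %s is offloaded; no direct host read\n", t->name);
            return 0.0f;
        }
        return ((const float *) t->data)[0]; // assumes GGML_TYPE_F32
    }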
@@ -497,6 +517,29 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
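Note: ggml_add1 broadcasts a 1-element tensor over a, and ggml_acc accumulates b into a strided view of a (strides and offset are in bytes, like ggml_set below). A minimal sketch, assuming this era's graph API (ggml_build_forward / ggml_graph_compute, which are not part of this hunk):

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { /*mem_size*/ 16 * 1024 * 1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 2.0f);

        // a + 3 everywhere (second argument of ggml_add1 is a 1-element tensor)
        struct ggml_tensor * c = ggml_add1(ctx, a, ggml_new_f32(ctx, 3.0f));

        // then accumulate b into row 2 of the result; strides/offset are in bytes
        struct ggml_tensor * d = ggml_acc(ctx, c, b, c->nb[1], c->nb[2], c->nb[3], 2 * c->nb[1]);

        struct ggml_cgraph gf = ggml_build_forward(d);
        ggml_graph_compute(ctx, &gf);
        ggml_free(ctx);
        return 0;
    }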
@@ -520,12 +563,24 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
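Note: ggml_sum_rows reduces only the first dimension, so a [3, 2] input yields a [1, 2] result. A small sketch under the same graph-API assumptions as above:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2);
        ggml_set_f32(a, 1.0f);

        struct ggml_tensor * s = ggml_sum_rows(ctx, a); // ne = [1, 2, 1, 1]

        struct ggml_cgraph gf = ggml_build_forward(s);
        ggml_graph_compute(ctx, &gf);

        std::printf("row sums: %f %f\n", ggml_get_f32_1d(s, 0), ggml_get_f32_1d(s, 1)); // 3, 3
        ggml_free(ctx);
        return 0;
    }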
@@ -567,6 +622,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +639,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
@@ -589,12 +658,66 @@ extern "C" {
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
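Note: the ggml_set family writes b into a view of a described by byte strides and a byte offset, either returning a modified copy (ggml_set) or mutating a in place (ggml_set_inplace); the _1d/_2d variants just fill in the unused strides. A hedged sketch writing a 4-element vector into the middle of a 16-element buffer (graph calls assumed as before):

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        struct ggml_tensor * b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(buf, 0.0f);
        ggml_set_f32(b, 1.0f);

        // overwrite elements [4..8) of buf; the offset is given in bytes
        struct ggml_tensor * out = ggml_set_1d(ctx, buf, b, 4 * ggml_element_size(buf));

        struct ggml_cgraph gf = ggml_build_forward(out);
        ggml_graph_compute(ctx, &gf);
        ggml_free(ctx);
        return 0;
    }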
@@ -615,6 +738,11 @@ extern "C" {
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -630,6 +758,14 @@ extern "C" {
             int64_t ne1,
             int64_t ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -655,6 +791,18 @@ extern "C" {
             size_t nb2, // slice stride in bytes
             size_t offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            size_t nb1, // row stride in bytes
+            size_t nb2, // slice stride in bytes
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
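Note: ggml_reshape_4d and ggml_view_4d round out the existing 1-3 dimension helpers; a view takes explicit byte strides, so a sub-block can be aliased without copying. A hedged sketch (ggml_view_2d is the pre-existing 2D counterpart):

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);

        // reinterpret the 16 contiguous elements as [2, 2, 2, 2]
        struct ggml_tensor * r = ggml_reshape_4d(ctx, a, 2, 2, 2, 2);

        // alias rows 0..1 of a: same data, same row stride, zero byte offset
        struct ggml_tensor * v = ggml_view_2d(ctx, a, 4, 2, a->nb[1], 0);

        (void) r; (void) v;
        ggml_free(ctx);
        return 0;
    }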
@@ -673,20 +821,50 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
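Note: ggml_diag_mask_inf_inplace followed by ggml_soft_max_inplace is the causal-attention pattern these additions enable without extra copies; diag_mask_zero is the counterpart that zeroes future positions instead. A sketch over a toy score matrix (n_past = 0 assumed, graph calls as before):

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        // toy attention scores: [n_tokens, n_tokens]
        struct ggml_tensor * scores = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        ggml_set_f32(scores, 1.0f);

        // mask out "future" positions, then normalize; both steps reuse the buffer
        struct ggml_tensor * masked = ggml_diag_mask_inf_inplace(ctx, scores, /*n_past=*/0);
        struct ggml_tensor * probs  = ggml_soft_max_inplace(ctx, masked);

        struct ggml_cgraph gf = ggml_build_forward(probs);
        ggml_graph_compute(ctx, &gf);
        ggml_free(ctx);
        return 0;
    }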
@@ -697,13 +875,39 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past,
-            int n_head);
+            int n_head,
+            float bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float min,
+            float max);
 
     // padding = 1
     // TODO: we don't support extra parameters for now
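Note: ggml_rope_inplace mirrors ggml_rope but rotates the tensor's own buffer, ggml_rope_back provides the gradient for training, and ggml_clamp bounds values in place. A short sketch; the shape and parameters are illustrative, not prescribed by the header:

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        // pretend Q projections for 2 positions: [head_dim=8, n_heads=1, N=2]
        struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 1, 2);
        ggml_set_f32(q, 0.5f);

        // rotate in place: n_past=0, rotate all 8 dims, mode 0 (original style)
        struct ggml_tensor * q_rot = ggml_rope_inplace(ctx, q, 0, 8, 0);

        // clamp to [-1, 1] in place
        struct ggml_tensor * q_cl = ggml_clamp(ctx, q_rot, -1.0f, 1.0f);

        struct ggml_cgraph gf = ggml_build_forward(q_cl);
        ggml_graph_compute(ctx, &gf);
        ggml_free(ctx);
        return 0;
    }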
@@ -741,13 +945,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            const  ggml_unary_op_f32_t fun);
+                   ggml_unary_op_f32_t fun);
 
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            const  ggml_binary_op_f32_t fun);
+                   ggml_binary_op_f32_t fun);
 
     //
     // automatic differentiation
@@ -876,7 +1080,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -101,12 +101,12 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }
 
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
@@ -127,12 +127,12 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
             throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
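Note: the constructor's prefetch flag becomes a byte count: the default (size_t) -1 keeps the old preload-everything behavior, 0 skips madvise entirely, and anything else caps the MADV_WILLNEED range. A hedged usage sketch against this internal helper (the model path and size are illustrative):

    #include "llama-util.h" // internal header bundled with the gem's llama.cpp sources

    int main() {
        llama_file file("models/7B/ggml-model-q4_0.bin", "rb"); // illustrative path

        // prefetch only the first 64 MiB instead of the whole mapping
        llama_mmap mapping(&file, 64u * 1024 * 1024);

        (void) mapping.addr;
        return 0;
    }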
@@ -267,9 +267,9 @@ struct llama_mlock {
         }
     }
 
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }
 
     void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }
 
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    size, this->size, llama_format_win_err(GetLastError()).c_str());
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
 
@@ -363,7 +363,7 @@ struct llama_mlock {
         // is equal to the number of pages in its minimum working set minus
         // a small overhead."
         // Hopefully a megabyte is enough overhead:
-        size_t increment = size + 1048576;
+        size_t increment = len + 1048576;
         // The minimum must be <= the maximum, so we need to increase both:
         min_ws_size += increment;
         max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
         }
     }
 
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
@@ -388,12 +388,12 @@ struct llama_mlock {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }
 
-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
 
@@ -404,10 +404,10 @@ struct llama_buffer {
 
     llama_buffer() = default;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }
 
     ~llama_buffer() {