llama_cpp 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,9 +190,12 @@
|
|
190
190
|
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
191
191
|
#define GGML_FILE_VERSION 1
|
192
192
|
|
193
|
+
#define GGML_QNT_VERSION 1 // bump this on quantization format changes
|
194
|
+
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
195
|
+
|
193
196
|
#define GGML_MAX_DIMS 4
|
194
197
|
#define GGML_MAX_NODES 4096
|
195
|
-
#define GGML_MAX_PARAMS
|
198
|
+
#define GGML_MAX_PARAMS 256
|
196
199
|
#define GGML_MAX_CONTEXTS 64
|
197
200
|
#define GGML_MAX_OPT 4
|
198
201
|
#define GGML_DEFAULT_N_THREADS 4
|
@@ -231,7 +234,7 @@ extern "C" {
|
|
231
234
|
GGML_TYPE_F16 = 1,
|
232
235
|
GGML_TYPE_Q4_0 = 2,
|
233
236
|
GGML_TYPE_Q4_1 = 3,
|
234
|
-
GGML_TYPE_Q4_2 = 4,
|
237
|
+
// GGML_TYPE_Q4_2 = 4, support has been removed
|
235
238
|
// GGML_TYPE_Q4_3 (5) support has been removed
|
236
239
|
GGML_TYPE_Q5_0 = 6,
|
237
240
|
GGML_TYPE_Q5_1 = 7,
|
@@ -243,6 +246,11 @@ extern "C" {
|
|
243
246
|
GGML_TYPE_COUNT,
|
244
247
|
};
|
245
248
|
|
249
|
+
enum ggml_backend {
|
250
|
+
GGML_BACKEND_CPU = 0,
|
251
|
+
GGML_BACKEND_CUDA = 1,
|
252
|
+
};
|
253
|
+
|
246
254
|
// model file types
|
247
255
|
enum ggml_ftype {
|
248
256
|
GGML_FTYPE_UNKNOWN = -1,
|
@@ -251,7 +259,6 @@ extern "C" {
|
|
251
259
|
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
252
260
|
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
253
261
|
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
254
|
-
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
255
262
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
256
263
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
257
264
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
@@ -263,12 +270,16 @@ extern "C" {
|
|
263
270
|
|
264
271
|
GGML_OP_DUP,
|
265
272
|
GGML_OP_ADD,
|
273
|
+
GGML_OP_ADD1,
|
274
|
+
GGML_OP_ACC,
|
266
275
|
GGML_OP_SUB,
|
267
276
|
GGML_OP_MUL,
|
268
277
|
GGML_OP_DIV,
|
269
278
|
GGML_OP_SQR,
|
270
279
|
GGML_OP_SQRT,
|
280
|
+
GGML_OP_LOG,
|
271
281
|
GGML_OP_SUM,
|
282
|
+
GGML_OP_SUM_ROWS,
|
272
283
|
GGML_OP_MEAN,
|
273
284
|
GGML_OP_REPEAT,
|
274
285
|
GGML_OP_ABS,
|
@@ -278,12 +289,15 @@ extern "C" {
|
|
278
289
|
GGML_OP_RELU,
|
279
290
|
GGML_OP_GELU,
|
280
291
|
GGML_OP_SILU,
|
292
|
+
GGML_OP_SILU_BACK,
|
281
293
|
GGML_OP_NORM, // normalize
|
282
294
|
GGML_OP_RMS_NORM,
|
295
|
+
GGML_OP_RMS_NORM_BACK,
|
283
296
|
|
284
297
|
GGML_OP_MUL_MAT,
|
285
298
|
|
286
299
|
GGML_OP_SCALE,
|
300
|
+
GGML_OP_SET,
|
287
301
|
GGML_OP_CPY,
|
288
302
|
GGML_OP_CONT,
|
289
303
|
GGML_OP_RESHAPE,
|
@@ -291,9 +305,13 @@ extern "C" {
|
|
291
305
|
GGML_OP_PERMUTE,
|
292
306
|
GGML_OP_TRANSPOSE,
|
293
307
|
GGML_OP_GET_ROWS,
|
308
|
+
GGML_OP_GET_ROWS_BACK,
|
309
|
+
GGML_OP_DIAG,
|
294
310
|
GGML_OP_DIAG_MASK_INF,
|
311
|
+
GGML_OP_DIAG_MASK_ZERO,
|
295
312
|
GGML_OP_SOFT_MAX,
|
296
313
|
GGML_OP_ROPE,
|
314
|
+
GGML_OP_ROPE_BACK,
|
297
315
|
GGML_OP_ALIBI,
|
298
316
|
GGML_OP_CONV_1D_1S,
|
299
317
|
GGML_OP_CONV_1D_2S,
|
@@ -322,7 +340,8 @@ extern "C" {
|
|
322
340
|
|
323
341
|
// n-dimensional tensor
|
324
342
|
struct ggml_tensor {
|
325
|
-
enum ggml_type
|
343
|
+
enum ggml_type type;
|
344
|
+
enum ggml_backend backend;
|
326
345
|
|
327
346
|
int n_dims;
|
328
347
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
@@ -353,7 +372,7 @@ extern "C" {
|
|
353
372
|
|
354
373
|
char name[32];
|
355
374
|
|
356
|
-
char padding[
|
375
|
+
char padding[16];
|
357
376
|
};
|
358
377
|
|
359
378
|
// computation graph
|
@@ -497,6 +516,29 @@ extern "C" {
|
|
497
516
|
struct ggml_tensor * a,
|
498
517
|
struct ggml_tensor * b);
|
499
518
|
|
519
|
+
GGML_API struct ggml_tensor * ggml_add1(
|
520
|
+
struct ggml_context * ctx,
|
521
|
+
struct ggml_tensor * a,
|
522
|
+
struct ggml_tensor * b);
|
523
|
+
|
524
|
+
GGML_API struct ggml_tensor * ggml_acc(
|
525
|
+
struct ggml_context * ctx,
|
526
|
+
struct ggml_tensor * a,
|
527
|
+
struct ggml_tensor * b,
|
528
|
+
size_t nb1,
|
529
|
+
size_t nb2,
|
530
|
+
size_t nb3,
|
531
|
+
size_t offset);
|
532
|
+
|
533
|
+
GGML_API struct ggml_tensor * ggml_acc_inplace(
|
534
|
+
struct ggml_context * ctx,
|
535
|
+
struct ggml_tensor * a,
|
536
|
+
struct ggml_tensor * b,
|
537
|
+
size_t nb1,
|
538
|
+
size_t nb2,
|
539
|
+
size_t nb3,
|
540
|
+
size_t offset);
|
541
|
+
|
500
542
|
GGML_API struct ggml_tensor * ggml_sub(
|
501
543
|
struct ggml_context * ctx,
|
502
544
|
struct ggml_tensor * a,
|
@@ -520,12 +562,24 @@ extern "C" {
|
|
520
562
|
struct ggml_context * ctx,
|
521
563
|
struct ggml_tensor * a);
|
522
564
|
|
565
|
+
GGML_API struct ggml_tensor * ggml_log(
|
566
|
+
struct ggml_context * ctx,
|
567
|
+
struct ggml_tensor * a);
|
568
|
+
|
569
|
+
GGML_API struct ggml_tensor * ggml_log_inplace(
|
570
|
+
struct ggml_context * ctx,
|
571
|
+
struct ggml_tensor * a);
|
572
|
+
|
523
573
|
// return scalar
|
524
|
-
// TODO: compute sum along rows
|
525
574
|
GGML_API struct ggml_tensor * ggml_sum(
|
526
575
|
struct ggml_context * ctx,
|
527
576
|
struct ggml_tensor * a);
|
528
577
|
|
578
|
+
// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
|
579
|
+
GGML_API struct ggml_tensor * ggml_sum_rows(
|
580
|
+
struct ggml_context * ctx,
|
581
|
+
struct ggml_tensor * a);
|
582
|
+
|
529
583
|
// mean along rows
|
530
584
|
GGML_API struct ggml_tensor * ggml_mean(
|
531
585
|
struct ggml_context * ctx,
|
@@ -567,6 +621,13 @@ extern "C" {
|
|
567
621
|
struct ggml_context * ctx,
|
568
622
|
struct ggml_tensor * a);
|
569
623
|
|
624
|
+
// a - x
|
625
|
+
// b - dy
|
626
|
+
GGML_API struct ggml_tensor * ggml_silu_back(
|
627
|
+
struct ggml_context * ctx,
|
628
|
+
struct ggml_tensor * a,
|
629
|
+
struct ggml_tensor * b);
|
630
|
+
|
570
631
|
// normalize along rows
|
571
632
|
// TODO: eps is hardcoded to 1e-5 for now
|
572
633
|
GGML_API struct ggml_tensor * ggml_norm(
|
@@ -577,6 +638,13 @@ extern "C" {
|
|
577
638
|
struct ggml_context * ctx,
|
578
639
|
struct ggml_tensor * a);
|
579
640
|
|
641
|
+
// a - x
|
642
|
+
// b - dy
|
643
|
+
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
644
|
+
struct ggml_context * ctx,
|
645
|
+
struct ggml_tensor * a,
|
646
|
+
struct ggml_tensor * b);
|
647
|
+
|
580
648
|
// A: m rows, n columns
|
581
649
|
// B: p rows, n columns (i.e. we transpose it internally)
|
582
650
|
// result is m columns, p rows
|
@@ -589,12 +657,66 @@ extern "C" {
|
|
589
657
|
// operations on tensors without backpropagation
|
590
658
|
//
|
591
659
|
|
592
|
-
// in-place, returns view(a)
|
593
660
|
GGML_API struct ggml_tensor * ggml_scale(
|
594
661
|
struct ggml_context * ctx,
|
595
662
|
struct ggml_tensor * a,
|
596
663
|
struct ggml_tensor * b);
|
597
664
|
|
665
|
+
// in-place, returns view(a)
|
666
|
+
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
667
|
+
struct ggml_context * ctx,
|
668
|
+
struct ggml_tensor * a,
|
669
|
+
struct ggml_tensor * b);
|
670
|
+
|
671
|
+
// b -> view(a,offset,nb1,nb2,nb3), return modified a
|
672
|
+
GGML_API struct ggml_tensor * ggml_set(
|
673
|
+
struct ggml_context * ctx,
|
674
|
+
struct ggml_tensor * a,
|
675
|
+
struct ggml_tensor * b,
|
676
|
+
size_t nb1,
|
677
|
+
size_t nb2,
|
678
|
+
size_t nb3,
|
679
|
+
size_t offset);
|
680
|
+
|
681
|
+
// b -> view(a,offset,nb1,nb2,nb3), return view(a)
|
682
|
+
GGML_API struct ggml_tensor * ggml_set_inplace(
|
683
|
+
struct ggml_context * ctx,
|
684
|
+
struct ggml_tensor * a,
|
685
|
+
struct ggml_tensor * b,
|
686
|
+
size_t nb1,
|
687
|
+
size_t nb2,
|
688
|
+
size_t nb3,
|
689
|
+
size_t offset);
|
690
|
+
|
691
|
+
GGML_API struct ggml_tensor * ggml_set_1d(
|
692
|
+
struct ggml_context * ctx,
|
693
|
+
struct ggml_tensor * a,
|
694
|
+
struct ggml_tensor * b,
|
695
|
+
size_t offset);
|
696
|
+
|
697
|
+
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
698
|
+
struct ggml_context * ctx,
|
699
|
+
struct ggml_tensor * a,
|
700
|
+
struct ggml_tensor * b,
|
701
|
+
size_t offset);
|
702
|
+
|
703
|
+
// b -> view(a,offset,nb1,nb2,nb3), return modified a
|
704
|
+
GGML_API struct ggml_tensor * ggml_set_2d(
|
705
|
+
struct ggml_context * ctx,
|
706
|
+
struct ggml_tensor * a,
|
707
|
+
struct ggml_tensor * b,
|
708
|
+
size_t nb1,
|
709
|
+
size_t offset);
|
710
|
+
|
711
|
+
// b -> view(a,offset,nb1,nb2,nb3), return view(a)
|
712
|
+
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
713
|
+
struct ggml_context * ctx,
|
714
|
+
struct ggml_tensor * a,
|
715
|
+
struct ggml_tensor * b,
|
716
|
+
size_t nb1,
|
717
|
+
size_t offset);
|
718
|
+
|
719
|
+
|
598
720
|
// a -> b, return view(b)
|
599
721
|
GGML_API struct ggml_tensor * ggml_cpy(
|
600
722
|
struct ggml_context * ctx,
|
@@ -615,6 +737,11 @@ extern "C" {
|
|
615
737
|
|
616
738
|
// return view(a)
|
617
739
|
// TODO: when we start computing gradient, make a copy instead of view
|
740
|
+
GGML_API struct ggml_tensor * ggml_reshape_1d(
|
741
|
+
struct ggml_context * ctx,
|
742
|
+
struct ggml_tensor * a,
|
743
|
+
int64_t ne0);
|
744
|
+
|
618
745
|
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
619
746
|
struct ggml_context * ctx,
|
620
747
|
struct ggml_tensor * a,
|
@@ -630,6 +757,14 @@ extern "C" {
|
|
630
757
|
int64_t ne1,
|
631
758
|
int64_t ne2);
|
632
759
|
|
760
|
+
GGML_API struct ggml_tensor * ggml_reshape_4d(
|
761
|
+
struct ggml_context * ctx,
|
762
|
+
struct ggml_tensor * a,
|
763
|
+
int64_t ne0,
|
764
|
+
int64_t ne1,
|
765
|
+
int64_t ne2,
|
766
|
+
int64_t ne3);
|
767
|
+
|
633
768
|
// offset in bytes
|
634
769
|
GGML_API struct ggml_tensor * ggml_view_1d(
|
635
770
|
struct ggml_context * ctx,
|
@@ -655,6 +790,18 @@ extern "C" {
|
|
655
790
|
size_t nb2, // slice stride in bytes
|
656
791
|
size_t offset);
|
657
792
|
|
793
|
+
GGML_API struct ggml_tensor * ggml_view_4d(
|
794
|
+
struct ggml_context * ctx,
|
795
|
+
struct ggml_tensor * a,
|
796
|
+
int64_t ne0,
|
797
|
+
int64_t ne1,
|
798
|
+
int64_t ne2,
|
799
|
+
int64_t ne3,
|
800
|
+
size_t nb1, // row stride in bytes
|
801
|
+
size_t nb2, // slice stride in bytes
|
802
|
+
size_t nb3,
|
803
|
+
size_t offset);
|
804
|
+
|
658
805
|
GGML_API struct ggml_tensor * ggml_permute(
|
659
806
|
struct ggml_context * ctx,
|
660
807
|
struct ggml_tensor * a,
|
@@ -673,20 +820,50 @@ extern "C" {
|
|
673
820
|
struct ggml_tensor * a,
|
674
821
|
struct ggml_tensor * b);
|
675
822
|
|
823
|
+
GGML_API struct ggml_tensor * ggml_get_rows_back(
|
824
|
+
struct ggml_context * ctx,
|
825
|
+
struct ggml_tensor * a,
|
826
|
+
struct ggml_tensor * b,
|
827
|
+
struct ggml_tensor * c);
|
828
|
+
|
829
|
+
GGML_API struct ggml_tensor * ggml_diag(
|
830
|
+
struct ggml_context * ctx,
|
831
|
+
struct ggml_tensor * a);
|
832
|
+
|
676
833
|
// set elements above the diagonal to -INF
|
677
|
-
// in-place, returns view(a)
|
678
834
|
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
679
835
|
struct ggml_context * ctx,
|
680
836
|
struct ggml_tensor * a,
|
681
837
|
int n_past);
|
682
838
|
|
683
839
|
// in-place, returns view(a)
|
840
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
|
841
|
+
struct ggml_context * ctx,
|
842
|
+
struct ggml_tensor * a,
|
843
|
+
int n_past);
|
844
|
+
|
845
|
+
// set elements above the diagonal to 0
|
846
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero(
|
847
|
+
struct ggml_context * ctx,
|
848
|
+
struct ggml_tensor * a,
|
849
|
+
int n_past);
|
850
|
+
|
851
|
+
// in-place, returns view(a)
|
852
|
+
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
853
|
+
struct ggml_context * ctx,
|
854
|
+
struct ggml_tensor * a,
|
855
|
+
int n_past);
|
856
|
+
|
684
857
|
GGML_API struct ggml_tensor * ggml_soft_max(
|
685
858
|
struct ggml_context * ctx,
|
686
859
|
struct ggml_tensor * a);
|
687
860
|
|
688
|
-
// rotary position embedding
|
689
861
|
// in-place, returns view(a)
|
862
|
+
GGML_API struct ggml_tensor * ggml_soft_max_inplace(
|
863
|
+
struct ggml_context * ctx,
|
864
|
+
struct ggml_tensor * a);
|
865
|
+
|
866
|
+
// rotary position embedding
|
690
867
|
// if mode & 1 == 1, skip n_past elements
|
691
868
|
// if mode & 2 == 1, GPT-NeoX style
|
692
869
|
// TODO: avoid creating a new tensor every time
|
@@ -697,6 +874,23 @@ extern "C" {
|
|
697
874
|
int n_dims,
|
698
875
|
int mode);
|
699
876
|
|
877
|
+
// in-place, returns view(a)
|
878
|
+
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
879
|
+
struct ggml_context * ctx,
|
880
|
+
struct ggml_tensor * a,
|
881
|
+
int n_past,
|
882
|
+
int n_dims,
|
883
|
+
int mode);
|
884
|
+
|
885
|
+
// rotary position embedding backward, i.e. compute dx from dy
|
886
|
+
// a - dy
|
887
|
+
GGML_API struct ggml_tensor * ggml_rope_back(
|
888
|
+
struct ggml_context * ctx,
|
889
|
+
struct ggml_tensor * a,
|
890
|
+
int n_past,
|
891
|
+
int n_dims,
|
892
|
+
int mode);
|
893
|
+
|
700
894
|
// alibi position embedding
|
701
895
|
// in-place, returns view(a)
|
702
896
|
struct ggml_tensor * ggml_alibi(
|
@@ -741,13 +935,13 @@ extern "C" {
|
|
741
935
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
742
936
|
struct ggml_context * ctx,
|
743
937
|
struct ggml_tensor * a,
|
744
|
-
|
938
|
+
ggml_unary_op_f32_t fun);
|
745
939
|
|
746
940
|
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
747
941
|
struct ggml_context * ctx,
|
748
942
|
struct ggml_tensor * a,
|
749
943
|
struct ggml_tensor * b,
|
750
|
-
|
944
|
+
ggml_binary_op_f32_t fun);
|
751
945
|
|
752
946
|
//
|
753
947
|
// automatic differentiation
|
@@ -876,7 +1070,6 @@ extern "C" {
|
|
876
1070
|
|
877
1071
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
878
1072
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
879
|
-
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
880
1073
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
881
1074
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
882
1075
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|