llama_cpp 0.1.0 → 0.1.1
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -190,9 +190,12 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION 1 // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
-#define GGML_MAX_PARAMS
+#define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4
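The new GGML_QNT_VERSION pair exists so loaders can reject files quantized under an older format. In the ggml example loaders the quantization version is multiplexed into the stored file-type word via GGML_QNT_VERSION_FACTOR; a minimal sketch of that convention (the `stored_ftype` variable is illustrative, not part of ggml.h):

```c
#include <stdint.h>
#include <stdio.h>

#define GGML_QNT_VERSION        1    // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this

int main(void) {
    // Writer side: fold the quantization version into the file-type word.
    int32_t ftype        = 2; // e.g. "mostly Q4_0"
    int32_t stored_ftype = ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR;

    // Reader side: split the word back apart and validate.
    int32_t qnt_version = stored_ftype / GGML_QNT_VERSION_FACTOR;
    int32_t real_ftype  = stored_ftype % GGML_QNT_VERSION_FACTOR;

    if (qnt_version != GGML_QNT_VERSION) {
        fprintf(stderr, "unsupported quantization version %d\n", qnt_version);
        return 1;
    }
    printf("ftype = %d (qnt version %d)\n", real_ftype, qnt_version);
    return 0;
}
```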
@@ -231,7 +234,7 @@ extern "C" {
         GGML_TYPE_F16 = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
@@ -243,6 +246,11 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
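The new enum is stored on every tensor (see the struct ggml_tensor hunk further down), so code can tag individual weights for CUDA offload. A hedged sketch; setting the field only labels the tensor, and the actual transfer is up to the CUDA code path (ggml-cuda.h also changed in this release):

```c
#include "ggml.h"

// Tag a tensor for the CUDA backend when offload is requested.
// The field is a label; moving the data is handled elsewhere.
static void tag_backend(struct ggml_tensor * t, int use_cuda) {
    t->backend = use_cuda ? GGML_BACKEND_CUDA : GGML_BACKEND_CPU;
}
```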
@@ -251,7 +259,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -263,12 +270,16 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_LOG,
         GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
         GGML_OP_ABS,
@@ -278,12 +289,15 @@ extern "C" {
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
 
         GGML_OP_SCALE,
+        GGML_OP_SET,
         GGML_OP_CPY,
         GGML_OP_CONT,
         GGML_OP_RESHAPE,
@@ -291,9 +305,13 @@ extern "C" {
         GGML_OP_PERMUTE,
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
@@ -322,7 +340,8 @@ extern "C" {
 
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type type;
+        enum ggml_backend backend;
 
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -353,7 +372,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[
+        char padding[16];
     };
 
     // computation graph
@@ -497,6 +516,29 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
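ggml_add1 broadcasts a one-element tensor over a, and ggml_acc accumulates b into a strided view of a (the nb*/offset arguments are in bytes, as with ggml_view_*). A small end-to-end sketch against this header revision, using the graph API as it existed at the time (ggml_graph_compute still took the context directly):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16 * 1024 * 1024, // 16 MB scratch arena
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 2.0f);                           // a = [2, 2, 2, 2]
    struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);

    struct ggml_tensor * y = ggml_add1(ctx, a, one); // y = a + 1 (broadcast)

    struct ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute(ctx, &gf);

    printf("y[0] = %f\n", ggml_get_f32_1d(y, 0));    // expect 3.0
    ggml_free(ctx);
    return 0;
}
```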
@@ -520,12 +562,24 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // return scalar
-    // TODO: compute sum along rows
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
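The header comment fixes the shape contract: summing the rows of an [a,b,c,d] tensor yields [1,b,c,d], while ggml_sum still reduces everything to a single scalar. A shape-only sketch (ctx is any live ggml context):

```c
#include "ggml.h"

// Shape contract of the new reduction op (values omitted for brevity).
void reduction_shapes(struct ggml_context * ctx) {
    struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2); // ne = [3, 2]
    struct ggml_tensor * r = ggml_sum_rows(ctx, m); // ne = [1, 2]
    struct ggml_tensor * s = ggml_sum(ctx, m);      // scalar: ne = [1]
    (void)r; (void)s;
}
```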
@@ -567,6 +621,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
@@ -577,6 +638,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // A: m rows, n columns
     // B: p rows, n columns (i.e. we transpose it internally)
     // result is m columns, p rows
@@ -589,12 +657,66 @@ extern "C" {
     // operations on tensors without backpropagation
     //
 
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t nb2,
+            size_t nb3,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t nb1,
+            size_t offset);
+
+
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
             struct ggml_context * ctx,
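All offsets and strides in the ggml_set family are byte-based; the _1d/_2d variants fix the unused strides from a itself. A hedged sketch of overwriting a 4-element span inside a larger 1-D tensor (the function name here is illustrative):

```c
#include "ggml.h"

// Write the 4-element vector b over elements [8..11] of a,
// using ggml_set_1d's byte offset. Returns the modified a.
struct ggml_tensor * patch_span(struct ggml_context * ctx,
                                struct ggml_tensor * a,   // F32, ne0 >= 12
                                struct ggml_tensor * b) { // F32, ne0 == 4
    const size_t offset = 8 * ggml_element_size(a); // offsets are in bytes
    return ggml_set_1d(ctx, a, b, offset);
}
```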
@@ -615,6 +737,11 @@ extern "C" {
 
     // return view(a)
     // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0);
+
     GGML_API struct ggml_tensor * ggml_reshape_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -630,6 +757,14 @@ extern "C" {
             int64_t ne1,
             int64_t ne2);
 
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3);
+
     // offset in bytes
     GGML_API struct ggml_tensor * ggml_view_1d(
             struct ggml_context * ctx,
@@ -655,6 +790,18 @@ extern "C" {
             size_t nb2, // slice stride in bytes
             size_t offset);
 
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            size_t nb1, // row stride in bytes
+            size_t nb2, // slice stride in bytes
+            size_t nb3,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
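ggml_view_4d rounds out the view family. For a contiguous tensor the byte strides follow directly from the element counts, e.g. nb1 = ne0 * element_size. A sketch of viewing a flat F32 buffer of 5*4*3*2 elements as 4-D (ggml orders ne from the fastest-varying dimension outward):

```c
#include "ggml.h"

// View a flat F32 tensor of 5*4*3*2 elements as a [5,4,3,2] 4-D tensor.
struct ggml_tensor * as_4d(struct ggml_context * ctx, struct ggml_tensor * flat) {
    const size_t es  = ggml_element_size(flat); // 4 bytes for F32
    const size_t nb1 = 5 * es;                  // row stride
    const size_t nb2 = 4 * nb1;                 // slice stride
    const size_t nb3 = 3 * nb2;
    return ggml_view_4d(ctx, flat, 5, 4, 3, 2, nb1, nb2, nb3, 0);
}
```

ggml_reshape_4d covers the same contiguous case without spelling out the strides; ggml_view_4d additionally allows non-contiguous layouts and a nonzero byte offset.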
@@ -673,20 +820,50 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // set elements above the diagonal to -INF
-    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_diag_mask_inf(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past);
 
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past);
+
     GGML_API struct ggml_tensor * ggml_soft_max(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // rotary position embedding
     // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
     // TODO: avoid creating a new tensor every time
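The in-place variants matter most in the attention path: llama.cpp-style graphs mask the scaled Q·K scores causally and then normalize them without allocating new buffers. A hedged sketch of that composition (the kq naming is illustrative):

```c
#include "ggml.h"

// Causal attention masking: hide future positions, then normalize.
// Both calls return views of kq, so no new data buffers are allocated.
struct ggml_tensor * masked_softmax(struct ggml_context * ctx,
                                    struct ggml_tensor * kq,
                                    int n_past) {
    kq = ggml_diag_mask_inf_inplace(ctx, kq, n_past); // upper diagonal -> -INF
    return ggml_soft_max_inplace(ctx, kq);            // rows sum to 1
}
```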
@@ -697,6 +874,23 @@ extern "C" {
             int n_dims,
             int mode);
 
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
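ggml_rope_inplace rotates in a view of its input, and ggml_rope_back is the gradient counterpart (its a argument is dy, per the comment above). A sketch of applying RoPE to query/key tensors the way LLaMA-style models do; mode 0 selects the default rotation, and n_rot comes from the model hyperparameters:

```c
#include "ggml.h"

// Rotate the first n_rot dims of q and k at the current n_past offset.
void apply_rope(struct ggml_context * ctx,
                struct ggml_tensor ** q, struct ggml_tensor ** k,
                int n_past, int n_rot) {
    *q = ggml_rope_inplace(ctx, *q, n_past, n_rot, 0);
    *k = ggml_rope_inplace(ctx, *k, n_past, n_rot, 0);
}
```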
@@ -741,13 +935,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-                   ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun);
 
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-                   ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun);
 
     //
     // automatic differentiation
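The reindented declarations take ggml's function-pointer typedefs; in this header ggml_unary_op_f32_t is void (*)(const int n, float * dst, const float * src). A sketch of plugging a custom element-wise op into a graph with ggml_map_unary_f32:

```c
#include <math.h>
#include "ggml.h"

// Custom element-wise op matching ggml_unary_op_f32_t:
// dst[i] = softsign(src[i]) = src[i] / (1 + |src[i]|)
static void softsign_f32(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = src[i] / (1.0f + fabsf(src[i]));
    }
}

// Insert the op into a graph; it is computed like any other node.
struct ggml_tensor * softsign(struct ggml_context * ctx, struct ggml_tensor * a) {
    return ggml_map_unary_f32(ctx, a, softsign_f32);
}
```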
@@ -876,7 +1070,6 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
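With Q4_2 gone, the remaining entry points share one shape: quantize n floats (k is the row length, a multiple of the 32-element block size), return the number of bytes written, and tally value statistics into a 16-bucket histogram, which is how the llama.cpp quantizer calls them. A hedged sketch, assuming n is a multiple of 32 (buffer sizing is deliberately generous):

```c
#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"

// Quantize n floats (n % 32 == 0 for Q5_0) and return the compressed size.
size_t compress_f32_to_q5_0(const float * src, int n) {
    void   * dst      = malloc((size_t) n * sizeof(float)); // upper bound: quantized < f32
    int64_t  hist[16] = {0};                                // value histogram buckets

    const size_t written = ggml_quantize_q5_0(src, dst, n, n, hist);

    free(dst);
    return written; // bytes actually produced
}
```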