llama_cpp 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -198,7 +198,7 @@
|
|
198
198
|
#define GGML_MAX_PARAMS 256
|
199
199
|
#define GGML_MAX_CONTEXTS 64
|
200
200
|
#define GGML_MAX_OPT 4
|
201
|
-
#define GGML_MAX_NAME
|
201
|
+
#define GGML_MAX_NAME 48
|
202
202
|
#define GGML_DEFAULT_N_THREADS 4
|
203
203
|
|
204
204
|
#define GGML_ASSERT(x) \
|
@@ -303,6 +303,7 @@ extern "C" {
|
|
303
303
|
GGML_OP_STEP,
|
304
304
|
GGML_OP_RELU,
|
305
305
|
GGML_OP_GELU,
|
306
|
+
GGML_OP_GELU_QUICK,
|
306
307
|
GGML_OP_SILU,
|
307
308
|
GGML_OP_SILU_BACK,
|
308
309
|
GGML_OP_NORM, // normalize
|
@@ -331,16 +332,23 @@ extern "C" {
|
|
331
332
|
GGML_OP_ROPE_BACK,
|
332
333
|
GGML_OP_ALIBI,
|
333
334
|
GGML_OP_CLAMP,
|
334
|
-
|
335
|
-
|
335
|
+
GGML_OP_CONV_1D_S1_PH,
|
336
|
+
GGML_OP_CONV_1D_S2_PH,
|
337
|
+
GGML_OP_CONV_2D_SK_P0,
|
336
338
|
|
337
339
|
GGML_OP_FLASH_ATTN,
|
338
340
|
GGML_OP_FLASH_FF,
|
339
341
|
GGML_OP_FLASH_ATTN_BACK,
|
342
|
+
GGML_OP_WIN_PART,
|
343
|
+
GGML_OP_WIN_UNPART,
|
340
344
|
|
341
345
|
GGML_OP_MAP_UNARY,
|
342
346
|
GGML_OP_MAP_BINARY,
|
343
347
|
|
348
|
+
GGML_OP_MAP_CUSTOM1,
|
349
|
+
GGML_OP_MAP_CUSTOM2,
|
350
|
+
GGML_OP_MAP_CUSTOM3,
|
351
|
+
|
344
352
|
GGML_OP_CROSS_ENTROPY_LOSS,
|
345
353
|
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
346
354
|
|
@@ -461,6 +469,9 @@ extern "C" {
|
|
461
469
|
GGML_API int64_t ggml_cycles(void);
|
462
470
|
GGML_API int64_t ggml_cycles_per_ms(void);
|
463
471
|
|
472
|
+
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
|
473
|
+
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
474
|
+
|
464
475
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
465
476
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
466
477
|
|
@@ -500,8 +511,9 @@ extern "C" {
|
|
500
511
|
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
501
512
|
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
502
513
|
|
503
|
-
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
504
|
-
GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
|
514
|
+
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
|
515
|
+
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
|
516
|
+
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
|
505
517
|
|
506
518
|
GGML_API struct ggml_tensor * ggml_new_tensor(
|
507
519
|
struct ggml_context * ctx,
|
@@ -556,8 +568,9 @@ extern "C" {
|
|
556
568
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
557
569
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
558
570
|
|
559
|
-
GGML_API const char *
|
560
|
-
GGML_API
|
571
|
+
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
|
572
|
+
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
|
573
|
+
GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
|
561
574
|
|
562
575
|
//
|
563
576
|
// operations on tensors with backpropagation
|
@@ -610,24 +623,47 @@ extern "C" {
|
|
610
623
|
struct ggml_tensor * a,
|
611
624
|
struct ggml_tensor * b);
|
612
625
|
|
626
|
+
GGML_API struct ggml_tensor * ggml_sub_inplace(
|
627
|
+
struct ggml_context * ctx,
|
628
|
+
struct ggml_tensor * a,
|
629
|
+
struct ggml_tensor * b);
|
630
|
+
|
613
631
|
GGML_API struct ggml_tensor * ggml_mul(
|
614
632
|
struct ggml_context * ctx,
|
615
633
|
struct ggml_tensor * a,
|
616
634
|
struct ggml_tensor * b);
|
617
635
|
|
636
|
+
GGML_API struct ggml_tensor * ggml_mul_inplace(
|
637
|
+
struct ggml_context * ctx,
|
638
|
+
struct ggml_tensor * a,
|
639
|
+
struct ggml_tensor * b);
|
640
|
+
|
618
641
|
GGML_API struct ggml_tensor * ggml_div(
|
619
642
|
struct ggml_context * ctx,
|
620
643
|
struct ggml_tensor * a,
|
621
644
|
struct ggml_tensor * b);
|
622
645
|
|
646
|
+
GGML_API struct ggml_tensor * ggml_div_inplace(
|
647
|
+
struct ggml_context * ctx,
|
648
|
+
struct ggml_tensor * a,
|
649
|
+
struct ggml_tensor * b);
|
650
|
+
|
623
651
|
GGML_API struct ggml_tensor * ggml_sqr(
|
624
652
|
struct ggml_context * ctx,
|
625
653
|
struct ggml_tensor * a);
|
626
654
|
|
655
|
+
GGML_API struct ggml_tensor * ggml_sqr_inplace(
|
656
|
+
struct ggml_context * ctx,
|
657
|
+
struct ggml_tensor * a);
|
658
|
+
|
627
659
|
GGML_API struct ggml_tensor * ggml_sqrt(
|
628
660
|
struct ggml_context * ctx,
|
629
661
|
struct ggml_tensor * a);
|
630
662
|
|
663
|
+
GGML_API struct ggml_tensor * ggml_sqrt_inplace(
|
664
|
+
struct ggml_context * ctx,
|
665
|
+
struct ggml_tensor * a);
|
666
|
+
|
631
667
|
GGML_API struct ggml_tensor * ggml_log(
|
632
668
|
struct ggml_context * ctx,
|
633
669
|
struct ggml_tensor * a);
|
@@ -667,31 +703,67 @@ extern "C" {
|
|
667
703
|
struct ggml_context * ctx,
|
668
704
|
struct ggml_tensor * a);
|
669
705
|
|
706
|
+
GGML_API struct ggml_tensor * ggml_abs_inplace(
|
707
|
+
struct ggml_context * ctx,
|
708
|
+
struct ggml_tensor * a);
|
709
|
+
|
670
710
|
GGML_API struct ggml_tensor * ggml_sgn(
|
671
711
|
struct ggml_context * ctx,
|
672
712
|
struct ggml_tensor * a);
|
673
713
|
|
714
|
+
GGML_API struct ggml_tensor * ggml_sgn_inplace(
|
715
|
+
struct ggml_context * ctx,
|
716
|
+
struct ggml_tensor * a);
|
717
|
+
|
674
718
|
GGML_API struct ggml_tensor * ggml_neg(
|
675
719
|
struct ggml_context * ctx,
|
676
720
|
struct ggml_tensor * a);
|
677
721
|
|
722
|
+
GGML_API struct ggml_tensor * ggml_neg_inplace(
|
723
|
+
struct ggml_context * ctx,
|
724
|
+
struct ggml_tensor * a);
|
725
|
+
|
678
726
|
GGML_API struct ggml_tensor * ggml_step(
|
679
727
|
struct ggml_context * ctx,
|
680
728
|
struct ggml_tensor * a);
|
681
729
|
|
730
|
+
GGML_API struct ggml_tensor * ggml_step_inplace(
|
731
|
+
struct ggml_context * ctx,
|
732
|
+
struct ggml_tensor * a);
|
733
|
+
|
682
734
|
GGML_API struct ggml_tensor * ggml_relu(
|
683
735
|
struct ggml_context * ctx,
|
684
736
|
struct ggml_tensor * a);
|
685
737
|
|
738
|
+
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
739
|
+
struct ggml_context * ctx,
|
740
|
+
struct ggml_tensor * a);
|
741
|
+
|
686
742
|
// TODO: double-check this computation is correct
|
687
743
|
GGML_API struct ggml_tensor * ggml_gelu(
|
688
744
|
struct ggml_context * ctx,
|
689
745
|
struct ggml_tensor * a);
|
690
746
|
|
747
|
+
GGML_API struct ggml_tensor * ggml_gelu_inplace(
|
748
|
+
struct ggml_context * ctx,
|
749
|
+
struct ggml_tensor * a);
|
750
|
+
|
751
|
+
GGML_API struct ggml_tensor * ggml_gelu_quick(
|
752
|
+
struct ggml_context * ctx,
|
753
|
+
struct ggml_tensor * a);
|
754
|
+
|
755
|
+
GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
|
756
|
+
struct ggml_context * ctx,
|
757
|
+
struct ggml_tensor * a);
|
758
|
+
|
691
759
|
GGML_API struct ggml_tensor * ggml_silu(
|
692
760
|
struct ggml_context * ctx,
|
693
761
|
struct ggml_tensor * a);
|
694
762
|
|
763
|
+
GGML_API struct ggml_tensor * ggml_silu_inplace(
|
764
|
+
struct ggml_context * ctx,
|
765
|
+
struct ggml_tensor * a);
|
766
|
+
|
695
767
|
// a - x
|
696
768
|
// b - dy
|
697
769
|
GGML_API struct ggml_tensor * ggml_silu_back(
|
@@ -705,10 +777,18 @@ extern "C" {
|
|
705
777
|
struct ggml_context * ctx,
|
706
778
|
struct ggml_tensor * a);
|
707
779
|
|
780
|
+
GGML_API struct ggml_tensor * ggml_norm_inplace(
|
781
|
+
struct ggml_context * ctx,
|
782
|
+
struct ggml_tensor * a);
|
783
|
+
|
708
784
|
GGML_API struct ggml_tensor * ggml_rms_norm(
|
709
785
|
struct ggml_context * ctx,
|
710
786
|
struct ggml_tensor * a);
|
711
787
|
|
788
|
+
GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
|
789
|
+
struct ggml_context * ctx,
|
790
|
+
struct ggml_tensor * a);
|
791
|
+
|
712
792
|
// a - x
|
713
793
|
// b - dy
|
714
794
|
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
@@ -956,13 +1036,15 @@ extern "C" {
|
|
956
1036
|
// rotary position embedding
|
957
1037
|
// if (mode & 1) != 0, skip n_past elements
|
958
1038
|
// if (mode & 2) != 0, GPT-NeoX style
|
1039
|
+
// if (mode & 4) != 0, ChatGLM style
|
959
1040
|
// TODO: avoid creating a new tensor every time
|
960
1041
|
GGML_API struct ggml_tensor * ggml_rope(
|
961
1042
|
struct ggml_context * ctx,
|
962
1043
|
struct ggml_tensor * a,
|
963
1044
|
int n_past,
|
964
1045
|
int n_dims,
|
965
|
-
int mode);
|
1046
|
+
int mode,
|
1047
|
+
int n_ctx);
|
966
1048
|
|
967
1049
|
// in-place, returns view(a)
|
968
1050
|
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
@@ -970,7 +1052,8 @@ extern "C" {
|
|
970
1052
|
struct ggml_tensor * a,
|
971
1053
|
int n_past,
|
972
1054
|
int n_dims,
|
973
|
-
int mode);
|
1055
|
+
int mode,
|
1056
|
+
int n_ctx);
|
974
1057
|
|
975
1058
|
// rotary position embedding backward, i.e compute dx from dy
|
976
1059
|
// a - dy
|
@@ -998,16 +1081,55 @@ extern "C" {
|
|
998
1081
|
float min,
|
999
1082
|
float max);
|
1000
1083
|
|
1001
|
-
//
|
1084
|
+
// TODO: implement general-purpose convolutions
|
1085
|
+
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
1086
|
+
// struct ggml_context * ctx,
|
1087
|
+
// struct ggml_tensor * a,
|
1088
|
+
// struct ggml_tensor * b,
|
1089
|
+
// int s0,
|
1090
|
+
// int p0,
|
1091
|
+
// int d0);
|
1092
|
+
//
|
1093
|
+
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
1094
|
+
// struct ggml_context * ctx,
|
1095
|
+
// struct ggml_tensor * a,
|
1096
|
+
// struct ggml_tensor * b,
|
1097
|
+
// int s0,
|
1098
|
+
// int s1,
|
1099
|
+
// int p0,
|
1100
|
+
// int p1,
|
1101
|
+
// int d0,
|
1102
|
+
// int d1);
|
1103
|
+
|
1104
|
+
// padding = half
|
1002
1105
|
// TODO: we don't support extra parameters for now
|
1003
1106
|
// that's why we are hard-coding the stride, padding, and dilation
|
1004
1107
|
// not great ..
|
1005
|
-
|
1108
|
+
// example:
|
1109
|
+
// a: 3 80 768 1
|
1110
|
+
// b: 3000 80 1 1
|
1111
|
+
// res: 3000 768 1 1
|
1112
|
+
// used in whisper
|
1113
|
+
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
1006
1114
|
struct ggml_context * ctx,
|
1007
1115
|
struct ggml_tensor * a,
|
1008
1116
|
struct ggml_tensor * b);
|
1009
1117
|
|
1010
|
-
|
1118
|
+
// used in whisper
|
1119
|
+
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
1120
|
+
struct ggml_context * ctx,
|
1121
|
+
struct ggml_tensor * a,
|
1122
|
+
struct ggml_tensor * b);
|
1123
|
+
|
1124
|
+
// kernel size is a->ne[0] x a->ne[1]
|
1125
|
+
// stride is equal to kernel size
|
1126
|
+
// padding is zero
|
1127
|
+
// example:
|
1128
|
+
// a: 16 16 3 768
|
1129
|
+
// b: 1024 1024 3 1
|
1130
|
+
// res: 64 64 768 1
|
1131
|
+
// used in sam
|
1132
|
+
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
1011
1133
|
struct ggml_context * ctx,
|
1012
1134
|
struct ggml_tensor * a,
|
1013
1135
|
struct ggml_tensor * b);
|
@@ -1035,21 +1157,93 @@ extern "C" {
|
|
1035
1157
|
struct ggml_tensor * c0,
|
1036
1158
|
struct ggml_tensor * c1);
|
1037
1159
|
|
1038
|
-
//
|
1039
|
-
|
1160
|
+
// partition into non-overlapping windows with padding if needed
|
1161
|
+
// example:
|
1162
|
+
// a: 768 64 64 1
|
1163
|
+
// w: 14
|
1164
|
+
// res: 768 14 14 25
|
1165
|
+
// used in sam
|
1166
|
+
GGML_API struct ggml_tensor * ggml_win_part(
|
1167
|
+
struct ggml_context * ctx,
|
1168
|
+
struct ggml_tensor * a,
|
1169
|
+
int w);
|
1170
|
+
|
1171
|
+
// reverse of ggml_win_part
|
1172
|
+
// used in sam
|
1173
|
+
GGML_API struct ggml_tensor * ggml_win_unpart(
|
1174
|
+
struct ggml_context * ctx,
|
1175
|
+
struct ggml_tensor * a,
|
1176
|
+
int w0,
|
1177
|
+
int h0,
|
1178
|
+
int w);
|
1179
|
+
|
1180
|
+
// custom operators
|
1181
|
+
|
1182
|
+
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
1040
1183
|
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
1041
1184
|
|
1185
|
+
typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
|
1186
|
+
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
1187
|
+
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
1188
|
+
|
1042
1189
|
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
1043
1190
|
struct ggml_context * ctx,
|
1044
1191
|
struct ggml_tensor * a,
|
1045
1192
|
ggml_unary_op_f32_t fun);
|
1046
1193
|
|
1194
|
+
GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
|
1195
|
+
struct ggml_context * ctx,
|
1196
|
+
struct ggml_tensor * a,
|
1197
|
+
ggml_unary_op_f32_t fun);
|
1198
|
+
|
1047
1199
|
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
1048
1200
|
struct ggml_context * ctx,
|
1049
1201
|
struct ggml_tensor * a,
|
1050
1202
|
struct ggml_tensor * b,
|
1051
1203
|
ggml_binary_op_f32_t fun);
|
1052
1204
|
|
1205
|
+
GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
|
1206
|
+
struct ggml_context * ctx,
|
1207
|
+
struct ggml_tensor * a,
|
1208
|
+
struct ggml_tensor * b,
|
1209
|
+
ggml_binary_op_f32_t fun);
|
1210
|
+
|
1211
|
+
GGML_API struct ggml_tensor * ggml_map_custom1_f32(
|
1212
|
+
struct ggml_context * ctx,
|
1213
|
+
struct ggml_tensor * a,
|
1214
|
+
ggml_custom1_op_f32_t fun);
|
1215
|
+
|
1216
|
+
GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
|
1217
|
+
struct ggml_context * ctx,
|
1218
|
+
struct ggml_tensor * a,
|
1219
|
+
ggml_custom1_op_f32_t fun);
|
1220
|
+
|
1221
|
+
GGML_API struct ggml_tensor * ggml_map_custom2_f32(
|
1222
|
+
struct ggml_context * ctx,
|
1223
|
+
struct ggml_tensor * a,
|
1224
|
+
struct ggml_tensor * b,
|
1225
|
+
ggml_custom2_op_f32_t fun);
|
1226
|
+
|
1227
|
+
GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
|
1228
|
+
struct ggml_context * ctx,
|
1229
|
+
struct ggml_tensor * a,
|
1230
|
+
struct ggml_tensor * b,
|
1231
|
+
ggml_custom2_op_f32_t fun);
|
1232
|
+
|
1233
|
+
GGML_API struct ggml_tensor * ggml_map_custom3_f32(
|
1234
|
+
struct ggml_context * ctx,
|
1235
|
+
struct ggml_tensor * a,
|
1236
|
+
struct ggml_tensor * b,
|
1237
|
+
struct ggml_tensor * c,
|
1238
|
+
ggml_custom3_op_f32_t fun);
|
1239
|
+
|
1240
|
+
GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
|
1241
|
+
struct ggml_context * ctx,
|
1242
|
+
struct ggml_tensor * a,
|
1243
|
+
struct ggml_tensor * b,
|
1244
|
+
struct ggml_tensor * c,
|
1245
|
+
ggml_custom3_op_f32_t fun);
|
1246
|
+
|
1053
1247
|
// loss function
|
1054
1248
|
|
1055
1249
|
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|