llama_cpp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -198,7 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
-#define GGML_MAX_NAME          32
+#define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4

 #define GGML_ASSERT(x) \
@@ -303,6 +303,7 @@ extern "C" {
         GGML_OP_STEP,
         GGML_OP_RELU,
         GGML_OP_GELU,
+        GGML_OP_GELU_QUICK,
         GGML_OP_SILU,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
@@ -331,16 +332,23 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_1S,
-        GGML_OP_CONV_1D_2S,
+        GGML_OP_CONV_1D_S1_PH,
+        GGML_OP_CONV_1D_S2_PH,
+        GGML_OP_CONV_2D_SK_P0,

         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_WIN_PART,
+        GGML_OP_WIN_UNPART,

         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,

+        GGML_OP_MAP_CUSTOM1,
+        GGML_OP_MAP_CUSTOM2,
+        GGML_OP_MAP_CUSTOM3,
+
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,

@@ -461,6 +469,9 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);

+    GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

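The two NUMA hooks are the only additions in this hunk; per the header comments they should be called once, early. A minimal sketch of how a host program might use them (the `ggml_init`/`ggml_free` boilerplate is standard ggml of this era; the arena size is an arbitrary choice for illustration):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_numa_init();                 // call once, before any heavy compute
    if (ggml_is_numa()) {
        fprintf(stderr, "detected more than one NUMA node\n");
    }

    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,   // 16 MiB arena, arbitrary for this sketch
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);
    // ... build and evaluate graphs ...
    ggml_free(ctx);
    return 0;
}
```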
@@ -500,8 +511,9 @@ extern "C" {
     GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

-    GGML_API void *  ggml_get_mem_buffer(struct ggml_context * ctx);
-    GGML_API size_t  ggml_get_mem_size  (struct ggml_context * ctx);
+    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);

     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
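Besides the `const`-qualification of the existing getters, `ggml_get_max_tensor_size` is new. A hedged usage sketch, assuming `ctx` is a valid context:

```c
#include <stdio.h>
#include "ggml.h"

// print arena statistics for an existing context; %zu matches the size_t returns
static void print_ctx_stats(const struct ggml_context * ctx) {
    fprintf(stderr, "arena at %p: %zu bytes total, largest tensor %zu bytes\n",
            ggml_get_mem_buffer(ctx),
            ggml_get_mem_size(ctx),
            ggml_get_max_tensor_size(ctx));
}
```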
@@ -556,8 +568,9 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

-    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
-    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);

     //
     // operations on tensors with backpropagation
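`ggml_set_name` now returns its tensor, so naming can be chained inline, and the printf-style `ggml_format_name` pairs with the larger `GGML_MAX_NAME` from the first hunk. A sketch; the shapes and names are invented:

```c
#include "ggml.h"

// assumes ctx is a valid ggml context
static void name_tensors(struct ggml_context * ctx) {
    // ggml_set_name returns the tensor, so it composes with the constructor
    struct ggml_tensor * w = ggml_set_name(
            ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64), "attn.weight");
    (void) w;

    for (int il = 0; il < 4; ++il) {
        struct ggml_tensor * h = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
        // printf-style; names longer than GGML_MAX_NAME (now 48) are truncated
        ggml_format_name(h, "layer_%d.hidden", il);
    }
}
```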
@@ -610,24 +623,47 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_sub_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_mul(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_mul_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_div(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_div_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_sqr(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_sqr_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sqrt(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_sqrt_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_log(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
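Every element-wise arithmetic op gains an `_inplace` twin here. The non-inplace form allocates a fresh result tensor in the context; the inplace form returns a view that writes over `a`'s data, which keeps memory flat in long op chains. A minimal sketch, assuming `ctx`, `a`, `b` are a valid context and two same-shaped f32 tensors:

```c
#include "ggml.h"

static struct ggml_tensor * mul_demo(struct ggml_context * ctx,
                                     struct ggml_tensor * a,
                                     struct ggml_tensor * b) {
    struct ggml_tensor * y  = ggml_mul(ctx, a, b);         // new result tensor in ctx
    struct ggml_tensor * y2 = ggml_mul_inplace(ctx, a, b); // view over a's buffer
    return ggml_add(ctx, y, y2);
}
```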
@@ -667,31 +703,67 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_abs_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sgn(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_sgn_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_neg(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_neg_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_step(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_step_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_relu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // TODO: double-check this computation is correct
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_gelu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_silu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_silu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_silu_back(
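This is the header side of `GGML_OP_GELU_QUICK` from the enum hunk. "Quick" GELU is the standard sigmoid approximation x * sigma(1.702 * x) of exact GELU x * Phi(x); a reference formula in scalar C (my paraphrase of the conventional approximation, not code from this diff):

```c
#include <math.h>

// quick GELU: x * sigmoid(1.702 * x), a cheap approximation of exact GELU;
// the 1.702 constant is the standard fit to the Gaussian CDF
static float gelu_quick_ref(float x) {
    return x / (1.0f + expf(-1.702f * x));
}
```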
@@ -705,10 +777,18 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -956,13 +1036,15 @@ extern "C" {
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
    // if mode & 2 == 1, GPT-NeoX style
+    // if mode & 4 == 1, ChatGLM style
     // TODO: avoid creating a new tensor every time
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int n_past,
             int n_dims,
-            int mode);
+            int mode,
+            int n_ctx);

     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -970,7 +1052,8 @@ extern "C" {
             struct ggml_tensor * a,
             int n_past,
             int n_dims,
-            int mode);
+            int mode,
+            int n_ctx);

     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
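Both rope variants gain a trailing `n_ctx` argument alongside the new mode-4 ChatGLM path, so every call site needs updating. A hedged before/after sketch of a typical transformer-layer call, where `ctx0`, `cur`, `n_past`, `n_rot`, and `n_ctx` are placeholders for a model's actual values:

```c
#include "ggml.h"

static struct ggml_tensor * apply_rope(struct ggml_context * ctx0,
                                       struct ggml_tensor * cur,
                                       int n_past, int n_rot, int n_ctx) {
    // 0.2.1-era call:  ggml_rope_inplace(ctx0, cur, n_past, n_rot, 0);
    // 0.3.0-era call:  the context length is threaded through as a new last argument
    return ggml_rope_inplace(ctx0, cur, n_past, n_rot, /*mode =*/ 0, n_ctx);
}
```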
@@ -998,16 +1081,55 @@ extern "C" {
             float min,
             float max);

-    //
+    // TODO: implement general-purpose convolutions
+    // GGML_API struct ggml_tensor * ggml_conv_1d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor * a,
+    //        struct ggml_tensor * b,
+    //        int s0
+    //        int p0,
+    //        int d0);
+    //
+    // GGML_API struct ggml_tensor * ggml_conv_2d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor * a,
+    //        struct ggml_tensor * b,
+    //        int s0,
+    //        int s1,
+    //        int p0,
+    //        int p1,
+    //        int d0,
+    //        int d1);
+
+    // padding = half
     // TODO: we don't support extra parameters for now
     // that's why we are hard-coding the stride, padding, and dilation
     // not great ..
-    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+    // example:
+    // a:      3    80   768    1
+    // b:   3000    80     1    1
+    // res: 3000   768     1    1
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);

-    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16    16     3  768
+    // b:   1024  1024     3    1
+    // res:   64    64   768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
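The shape examples in the comments follow from the hard-coded hyperparameters. For `ggml_conv_2d_sk_p0`, stride equals the kernel size and padding is zero, so each spatial output dimension is simply input divided by kernel; a hypothetical helper (not part of the header) to make the arithmetic explicit:

```c
// output side length when stride == kernel size and padding == 0
static int conv2d_sk_p0_out(int in_size, int kernel_size) {
    return in_size / kernel_size;
}
// the header's sam example: conv2d_sk_p0_out(1024, 16) == 64
```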
@@ -1035,21 +1157,93 @@ extern "C" {
             struct ggml_tensor * c0,
             struct ggml_tensor * c1);

-    //
-    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    // partition into non-overlapping windows with padding if needed
+    // example:
+    // a:   768   64   64    1
+    // w:   14
+    // res: 768   14   14   25
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_part(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int w);
+
+    // reverse of ggml_win_part
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_unpart(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int w0,
+            int h0,
+            int w);
+
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
     typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             ggml_unary_op_f32_t fun);

+    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_unary_op_f32_t fun);
+
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             ggml_binary_op_f32_t fun);

+    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_binary_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_f32_t fun);
+
     // loss function

     GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
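Two notes on this final hunk. First, the `ggml_win_part` shape example is consistent: splitting a 64x64 grid into 14x14 windows takes ceil(64/14) = 5 windows per side, i.e. 5x5 = 25 padded windows, hence `res: 768 14 14 25`. Second, the new `map_custom*` family lets callers splice arbitrary f32 operations into a graph; a sketch against the `ggml_custom1_op_f32_t` signature above (the `clip01` op and tensor names are invented for illustration):

```c
#include "ggml.h"

// hypothetical custom op: clamp every element to [0, 1]
static void clip01(struct ggml_tensor * dst, const struct ggml_tensor * src) {
    const float * x = (const float *) src->data;
    float       * y = (float *) dst->data;
    const int n = (int) ggml_nelements(src);
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] < 0.0f ? 0.0f : (x[i] > 1.0f ? 1.0f : x[i]);
    }
}

// usage during graph construction, given a context ctx and input tensor inp:
//   struct ggml_tensor * out = ggml_map_custom1_f32(ctx, inp, clip01);
```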