llama_cpp 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +697 -130
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +548 -497
- data/ext/llama_cpp/src/ggml-metal.metal +425 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -32
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +1904 -303
- data/ext/llama_cpp/src/ggml.h +126 -2
- data/ext/llama_cpp/src/llama.cpp +212 -108
- data/ext/llama_cpp/src/llama.h +12 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -296,6 +296,7 @@ extern "C" {
|
|
296
296
|
GGML_OP_SUM_ROWS,
|
297
297
|
GGML_OP_MEAN,
|
298
298
|
GGML_OP_REPEAT,
|
299
|
+
GGML_OP_REPEAT_BACK,
|
299
300
|
GGML_OP_ABS,
|
300
301
|
GGML_OP_SGN,
|
301
302
|
GGML_OP_NEG,
|
@@ -309,6 +310,7 @@ extern "C" {
|
|
309
310
|
GGML_OP_RMS_NORM_BACK,
|
310
311
|
|
311
312
|
GGML_OP_MUL_MAT,
|
313
|
+
GGML_OP_OUT_PROD,
|
312
314
|
|
313
315
|
GGML_OP_SCALE,
|
314
316
|
GGML_OP_SET,
|
@@ -324,6 +326,7 @@ extern "C" {
|
|
324
326
|
GGML_OP_DIAG_MASK_INF,
|
325
327
|
GGML_OP_DIAG_MASK_ZERO,
|
326
328
|
GGML_OP_SOFT_MAX,
|
329
|
+
GGML_OP_SOFT_MAX_BACK,
|
327
330
|
GGML_OP_ROPE,
|
328
331
|
GGML_OP_ROPE_BACK,
|
329
332
|
GGML_OP_ALIBI,
|
@@ -333,10 +336,14 @@ extern "C" {
|
|
333
336
|
|
334
337
|
GGML_OP_FLASH_ATTN,
|
335
338
|
GGML_OP_FLASH_FF,
|
339
|
+
GGML_OP_FLASH_ATTN_BACK,
|
336
340
|
|
337
341
|
GGML_OP_MAP_UNARY,
|
338
342
|
GGML_OP_MAP_BINARY,
|
339
343
|
|
344
|
+
GGML_OP_CROSS_ENTROPY_LOSS,
|
345
|
+
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
346
|
+
|
340
347
|
GGML_OP_COUNT,
|
341
348
|
};
|
342
349
|
|
@@ -478,6 +485,7 @@ extern "C" {
|
|
478
485
|
|
479
486
|
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
480
487
|
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
488
|
+
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
481
489
|
|
482
490
|
// use this to compute the memory overhead of a tensor
|
483
491
|
GGML_API size_t ggml_tensor_overhead(void);
|
@@ -574,6 +582,11 @@ extern "C" {
|
|
574
582
|
struct ggml_tensor * a,
|
575
583
|
struct ggml_tensor * b);
|
576
584
|
|
585
|
+
GGML_API struct ggml_tensor * ggml_add1_inplace(
|
586
|
+
struct ggml_context * ctx,
|
587
|
+
struct ggml_tensor * a,
|
588
|
+
struct ggml_tensor * b);
|
589
|
+
|
577
590
|
GGML_API struct ggml_tensor * ggml_acc(
|
578
591
|
struct ggml_context * ctx,
|
579
592
|
struct ggml_tensor * a,
|
@@ -645,6 +658,11 @@ extern "C" {
|
|
645
658
|
struct ggml_tensor * a,
|
646
659
|
struct ggml_tensor * b);
|
647
660
|
|
661
|
+
GGML_API struct ggml_tensor * ggml_repeat_back(
|
662
|
+
struct ggml_context * ctx,
|
663
|
+
struct ggml_tensor * a,
|
664
|
+
struct ggml_tensor * b);
|
665
|
+
|
648
666
|
GGML_API struct ggml_tensor * ggml_abs(
|
649
667
|
struct ggml_context * ctx,
|
650
668
|
struct ggml_tensor * a);
|
@@ -698,14 +716,22 @@ extern "C" {
|
|
698
716
|
struct ggml_tensor * a,
|
699
717
|
struct ggml_tensor * b);
|
700
718
|
|
701
|
-
// A:
|
702
|
-
// B:
|
719
|
+
// A: n columns, m rows
|
720
|
+
// B: n columns, p rows (i.e. we transpose it internally)
|
703
721
|
// result is m columns, p rows
|
704
722
|
GGML_API struct ggml_tensor * ggml_mul_mat(
|
705
723
|
struct ggml_context * ctx,
|
706
724
|
struct ggml_tensor * a,
|
707
725
|
struct ggml_tensor * b);
|
708
726
|
|
727
|
+
// A: m columns, n rows,
|
728
|
+
// B: p columns, n rows,
|
729
|
+
// result is m columns, p rows
|
730
|
+
GGML_API struct ggml_tensor * ggml_out_prod(
|
731
|
+
struct ggml_context * ctx,
|
732
|
+
struct ggml_tensor * a,
|
733
|
+
struct ggml_tensor * b);
|
734
|
+
|
709
735
|
//
|
710
736
|
// operations on tensors without backpropagation
|
711
737
|
//
|
@@ -916,6 +942,17 @@ extern "C" {
|
|
916
942
|
struct ggml_context * ctx,
|
917
943
|
struct ggml_tensor * a);
|
918
944
|
|
945
|
+
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
946
|
+
struct ggml_context * ctx,
|
947
|
+
struct ggml_tensor * a,
|
948
|
+
struct ggml_tensor * b);
|
949
|
+
|
950
|
+
// in-place, returns view(a)
|
951
|
+
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
|
952
|
+
struct ggml_context * ctx,
|
953
|
+
struct ggml_tensor * a,
|
954
|
+
struct ggml_tensor * b);
|
955
|
+
|
919
956
|
// rotary position embedding
|
920
957
|
// if mode & 1 == 1, skip n_past elements
|
921
958
|
// if (mode & 2) != 0, GPT-NeoX style
|
@@ -982,6 +1019,14 @@ extern "C" {
|
|
982
1019
|
struct ggml_tensor * v,
|
983
1020
|
bool masked);
|
984
1021
|
|
1022
|
+
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
1023
|
+
struct ggml_context * ctx,
|
1024
|
+
struct ggml_tensor * q,
|
1025
|
+
struct ggml_tensor * k,
|
1026
|
+
struct ggml_tensor * v,
|
1027
|
+
struct ggml_tensor * d,
|
1028
|
+
bool masked);
|
1029
|
+
|
985
1030
|
GGML_API struct ggml_tensor * ggml_flash_ff(
|
986
1031
|
struct ggml_context * ctx,
|
987
1032
|
struct ggml_tensor * a,
|
@@ -1005,6 +1050,19 @@ extern "C" {
|
|
1005
1050
|
struct ggml_tensor * b,
|
1006
1051
|
ggml_binary_op_f32_t fun);
|
1007
1052
|
|
1053
|
+
// loss function
|
1054
|
+
|
1055
|
+
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|
1056
|
+
struct ggml_context * ctx,
|
1057
|
+
struct ggml_tensor * a,
|
1058
|
+
struct ggml_tensor * b);
|
1059
|
+
|
1060
|
+
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
|
1061
|
+
struct ggml_context * ctx,
|
1062
|
+
struct ggml_tensor * a,
|
1063
|
+
struct ggml_tensor * b,
|
1064
|
+
struct ggml_tensor * c);
|
1065
|
+
|
1008
1066
|
//
|
1009
1067
|
// automatic differentiation
|
1010
1068
|
//
|
@@ -1099,6 +1157,8 @@ extern "C" {
|
|
1099
1157
|
struct {
|
1100
1158
|
int n_iter;
|
1101
1159
|
|
1160
|
+
float sched; // schedule multiplier (fixed, decay or warmup)
|
1161
|
+
float decay; // weight decay for AdamW, use 0.0f to disable
|
1102
1162
|
float alpha; // learning rate
|
1103
1163
|
float beta1;
|
1104
1164
|
float beta2;
|
@@ -1123,6 +1183,49 @@ extern "C" {
|
|
1123
1183
|
} lbfgs;
|
1124
1184
|
};
|
1125
1185
|
|
1186
|
+
struct ggml_opt_context {
|
1187
|
+
struct ggml_context * ctx;
|
1188
|
+
struct ggml_opt_params params;
|
1189
|
+
|
1190
|
+
int iter;
|
1191
|
+
int64_t nx; // number of parameter elements
|
1192
|
+
|
1193
|
+
bool just_initialized;
|
1194
|
+
|
1195
|
+
struct {
|
1196
|
+
struct ggml_tensor * x; // view of the parameters
|
1197
|
+
struct ggml_tensor * g1; // gradient
|
1198
|
+
struct ggml_tensor * g2; // gradient squared
|
1199
|
+
struct ggml_tensor * m; // first moment
|
1200
|
+
struct ggml_tensor * v; // second moment
|
1201
|
+
struct ggml_tensor * mh; // first moment hat
|
1202
|
+
struct ggml_tensor * vh; // second moment hat
|
1203
|
+
struct ggml_tensor * pf; // past function values
|
1204
|
+
float fx_best;
|
1205
|
+
float fx_prev;
|
1206
|
+
int n_no_improvement;
|
1207
|
+
} adam;
|
1208
|
+
|
1209
|
+
struct {
|
1210
|
+
struct ggml_tensor * x; // current parameters
|
1211
|
+
struct ggml_tensor * xp; // previous parameters
|
1212
|
+
struct ggml_tensor * g; // current gradient
|
1213
|
+
struct ggml_tensor * gp; // previous gradient
|
1214
|
+
struct ggml_tensor * d; // search direction
|
1215
|
+
struct ggml_tensor * pf; // past function values
|
1216
|
+
struct ggml_tensor * lmal; // the L-BFGS memory alpha
|
1217
|
+
struct ggml_tensor * lmys; // the L-BFGS memory ys
|
1218
|
+
struct ggml_tensor * lms; // the L-BFGS memory s
|
1219
|
+
struct ggml_tensor * lmy; // the L-BFGS memory y
|
1220
|
+
float fx_best;
|
1221
|
+
float step;
|
1222
|
+
int j;
|
1223
|
+
int k;
|
1224
|
+
int end;
|
1225
|
+
int n_no_improvement;
|
1226
|
+
} lbfgs;
|
1227
|
+
};
|
1228
|
+
|
1126
1229
|
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
1127
1230
|
|
1128
1231
|
// optimize the function defined by the tensor f
|
@@ -1131,6 +1234,27 @@ extern "C" {
|
|
1131
1234
|
struct ggml_opt_params params,
|
1132
1235
|
struct ggml_tensor * f);
|
1133
1236
|
|
1237
|
+
// initialize optimizer context
|
1238
|
+
GGML_API void ggml_opt_init(
|
1239
|
+
struct ggml_context * ctx,
|
1240
|
+
struct ggml_opt_context * opt,
|
1241
|
+
struct ggml_opt_params params,
|
1242
|
+
int64_t nx);
|
1243
|
+
|
1244
|
+
// continue optimizing the function defined by the tensor f
|
1245
|
+
GGML_API enum ggml_opt_result ggml_opt_resume(
|
1246
|
+
struct ggml_context * ctx,
|
1247
|
+
struct ggml_opt_context * opt,
|
1248
|
+
struct ggml_tensor * f);
|
1249
|
+
|
1250
|
+
// continue optimizing the function defined by the tensor f, using the caller-supplied graphs gf and gb
|
1251
|
+
GGML_API enum ggml_opt_result ggml_opt_resume_g(
|
1252
|
+
struct ggml_context * ctx,
|
1253
|
+
struct ggml_opt_context * opt,
|
1254
|
+
struct ggml_tensor * f,
|
1255
|
+
struct ggml_cgraph * gf,
|
1256
|
+
struct ggml_cgraph * gb);
|
1257
|
+
|
1134
1258
|
//
|
1135
1259
|
// quantization
|
1136
1260
|
//
|