llama_cpp 0.1.4 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -241,6 +241,13 @@ extern "C" {
|
|
241
241
|
GGML_TYPE_Q5_1 = 7,
|
242
242
|
GGML_TYPE_Q8_0 = 8,
|
243
243
|
GGML_TYPE_Q8_1 = 9,
|
244
|
+
// k-quantizations
|
245
|
+
GGML_TYPE_Q2_K = 10,
|
246
|
+
GGML_TYPE_Q3_K = 11,
|
247
|
+
GGML_TYPE_Q4_K = 12,
|
248
|
+
GGML_TYPE_Q5_K = 13,
|
249
|
+
GGML_TYPE_Q6_K = 14,
|
250
|
+
GGML_TYPE_Q8_K = 15,
|
244
251
|
GGML_TYPE_I8,
|
245
252
|
GGML_TYPE_I16,
|
246
253
|
GGML_TYPE_I32,
|
@@ -249,8 +256,8 @@ extern "C" {
|
|
249
256
|
|
250
257
|
enum ggml_backend {
|
251
258
|
GGML_BACKEND_CPU = 0,
|
252
|
-
|
253
|
-
|
259
|
+
GGML_BACKEND_GPU = 10,
|
260
|
+
GGML_BACKEND_GPU_SPLIT = 20,
|
254
261
|
};
|
255
262
|
|
256
263
|
// model file types
|
@@ -264,6 +271,11 @@ extern "C" {
|
|
264
271
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
265
272
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
266
273
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
274
|
+
GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
275
|
+
GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
|
276
|
+
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
277
|
+
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
278
|
+
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
267
279
|
};
|
268
280
|
|
269
281
|
// available tensor operations:
|
@@ -284,6 +296,7 @@ extern "C" {
|
|
284
296
|
GGML_OP_SUM_ROWS,
|
285
297
|
GGML_OP_MEAN,
|
286
298
|
GGML_OP_REPEAT,
|
299
|
+
GGML_OP_REPEAT_BACK,
|
287
300
|
GGML_OP_ABS,
|
288
301
|
GGML_OP_SGN,
|
289
302
|
GGML_OP_NEG,
|
@@ -297,6 +310,7 @@ extern "C" {
|
|
297
310
|
GGML_OP_RMS_NORM_BACK,
|
298
311
|
|
299
312
|
GGML_OP_MUL_MAT,
|
313
|
+
GGML_OP_OUT_PROD,
|
300
314
|
|
301
315
|
GGML_OP_SCALE,
|
302
316
|
GGML_OP_SET,
|
@@ -312,6 +326,7 @@ extern "C" {
|
|
312
326
|
GGML_OP_DIAG_MASK_INF,
|
313
327
|
GGML_OP_DIAG_MASK_ZERO,
|
314
328
|
GGML_OP_SOFT_MAX,
|
329
|
+
GGML_OP_SOFT_MAX_BACK,
|
315
330
|
GGML_OP_ROPE,
|
316
331
|
GGML_OP_ROPE_BACK,
|
317
332
|
GGML_OP_ALIBI,
|
@@ -321,10 +336,14 @@ extern "C" {
|
|
321
336
|
|
322
337
|
GGML_OP_FLASH_ATTN,
|
323
338
|
GGML_OP_FLASH_FF,
|
339
|
+
GGML_OP_FLASH_ATTN_BACK,
|
324
340
|
|
325
341
|
GGML_OP_MAP_UNARY,
|
326
342
|
GGML_OP_MAP_BINARY,
|
327
343
|
|
344
|
+
GGML_OP_CROSS_ENTROPY_LOSS,
|
345
|
+
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
346
|
+
|
328
347
|
GGML_OP_COUNT,
|
329
348
|
};
|
330
349
|
|
@@ -375,7 +394,9 @@ extern "C" {
|
|
375
394
|
|
376
395
|
char name[GGML_MAX_NAME];
|
377
396
|
|
378
|
-
|
397
|
+
void * extra; // extra things e.g. for ggml-cuda.cu
|
398
|
+
|
399
|
+
char padding[4];
|
379
400
|
};
|
380
401
|
|
381
402
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
@@ -413,6 +434,25 @@ extern "C" {
|
|
413
434
|
bool no_alloc; // don't allocate memory for the tensor data
|
414
435
|
};
|
415
436
|
|
437
|
+
|
438
|
+
// compute types
|
439
|
+
enum ggml_task_type {
|
440
|
+
GGML_TASK_INIT = 0,
|
441
|
+
GGML_TASK_COMPUTE,
|
442
|
+
GGML_TASK_FINALIZE,
|
443
|
+
};
|
444
|
+
|
445
|
+
struct ggml_compute_params {
|
446
|
+
enum ggml_task_type type;
|
447
|
+
|
448
|
+
// ith = thread index, nth = number of threads
|
449
|
+
int ith, nth;
|
450
|
+
|
451
|
+
// work buffer for all threads
|
452
|
+
size_t wsize;
|
453
|
+
void * wdata;
|
454
|
+
};
|
455
|
+
|
416
456
|
// misc
|
417
457
|
|
418
458
|
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
@@ -424,8 +464,10 @@ extern "C" {
|
|
424
464
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
425
465
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
426
466
|
|
427
|
-
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
428
|
-
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
467
|
+
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
468
|
+
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
469
|
+
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
470
|
+
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
429
471
|
|
430
472
|
GGML_API int ggml_blck_size (enum ggml_type type);
|
431
473
|
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
@@ -441,13 +483,17 @@ extern "C" {
|
|
441
483
|
// TODO: temporary until model loading of ggml examples is refactored
|
442
484
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
443
485
|
|
486
|
+
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
487
|
+
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
488
|
+
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
489
|
+
|
444
490
|
// use this to compute the memory overhead of a tensor
|
445
491
|
GGML_API size_t ggml_tensor_overhead(void);
|
446
492
|
|
447
493
|
// main
|
448
494
|
|
449
495
|
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
450
|
-
GGML_API void ggml_free(struct ggml_context * ctx);
|
496
|
+
GGML_API void ggml_free(struct ggml_context * ctx);
|
451
497
|
|
452
498
|
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
453
499
|
|
@@ -536,6 +582,11 @@ extern "C" {
|
|
536
582
|
struct ggml_tensor * a,
|
537
583
|
struct ggml_tensor * b);
|
538
584
|
|
585
|
+
GGML_API struct ggml_tensor * ggml_add1_inplace(
|
586
|
+
struct ggml_context * ctx,
|
587
|
+
struct ggml_tensor * a,
|
588
|
+
struct ggml_tensor * b);
|
589
|
+
|
539
590
|
GGML_API struct ggml_tensor * ggml_acc(
|
540
591
|
struct ggml_context * ctx,
|
541
592
|
struct ggml_tensor * a,
|
@@ -607,6 +658,11 @@ extern "C" {
|
|
607
658
|
struct ggml_tensor * a,
|
608
659
|
struct ggml_tensor * b);
|
609
660
|
|
661
|
+
GGML_API struct ggml_tensor * ggml_repeat_back(
|
662
|
+
struct ggml_context * ctx,
|
663
|
+
struct ggml_tensor * a,
|
664
|
+
struct ggml_tensor * b);
|
665
|
+
|
610
666
|
GGML_API struct ggml_tensor * ggml_abs(
|
611
667
|
struct ggml_context * ctx,
|
612
668
|
struct ggml_tensor * a);
|
@@ -660,14 +716,22 @@ extern "C" {
|
|
660
716
|
struct ggml_tensor * a,
|
661
717
|
struct ggml_tensor * b);
|
662
718
|
|
663
|
-
// A: m rows, n columns
|
664
|
-
// B: p rows, n columns (i.e. we transpose it internally)
|
719
|
+
// A: n columns, m rows
|
720
|
+
// B: n columns, p rows (i.e. we transpose it internally)
|
665
721
|
// result is m columns, p rows
|
666
722
|
GGML_API struct ggml_tensor * ggml_mul_mat(
|
667
723
|
struct ggml_context * ctx,
|
668
724
|
struct ggml_tensor * a,
|
669
725
|
struct ggml_tensor * b);
|
670
726
|
|
727
|
+
// A: m columns, n rows,
|
728
|
+
// B: p columns, n rows,
|
729
|
+
// result is m columns, p rows
|
730
|
+
GGML_API struct ggml_tensor * ggml_out_prod(
|
731
|
+
struct ggml_context * ctx,
|
732
|
+
struct ggml_tensor * a,
|
733
|
+
struct ggml_tensor * b);
|
734
|
+
|
671
735
|
//
|
672
736
|
// operations on tensors without backpropagation
|
673
737
|
//
|
@@ -878,6 +942,17 @@ extern "C" {
|
|
878
942
|
struct ggml_context * ctx,
|
879
943
|
struct ggml_tensor * a);
|
880
944
|
|
945
|
+
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
946
|
+
struct ggml_context * ctx,
|
947
|
+
struct ggml_tensor * a,
|
948
|
+
struct ggml_tensor * b);
|
949
|
+
|
950
|
+
// in-place, returns view(a)
|
951
|
+
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
|
952
|
+
struct ggml_context * ctx,
|
953
|
+
struct ggml_tensor * a,
|
954
|
+
struct ggml_tensor * b);
|
955
|
+
|
881
956
|
// rotary position embedding
|
882
957
|
// if mode & 1 == 1, skip n_past elements
|
883
958
|
// if mode & 2 == 2, GPT-NeoX style
|
@@ -944,6 +1019,14 @@ extern "C" {
|
|
944
1019
|
struct ggml_tensor * v,
|
945
1020
|
bool masked);
|
946
1021
|
|
1022
|
+
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
1023
|
+
struct ggml_context * ctx,
|
1024
|
+
struct ggml_tensor * q,
|
1025
|
+
struct ggml_tensor * k,
|
1026
|
+
struct ggml_tensor * v,
|
1027
|
+
struct ggml_tensor * d,
|
1028
|
+
bool masked);
|
1029
|
+
|
947
1030
|
GGML_API struct ggml_tensor * ggml_flash_ff(
|
948
1031
|
struct ggml_context * ctx,
|
949
1032
|
struct ggml_tensor * a,
|
@@ -967,6 +1050,19 @@ extern "C" {
|
|
967
1050
|
struct ggml_tensor * b,
|
968
1051
|
ggml_binary_op_f32_t fun);
|
969
1052
|
|
1053
|
+
// loss function
|
1054
|
+
|
1055
|
+
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|
1056
|
+
struct ggml_context * ctx,
|
1057
|
+
struct ggml_tensor * a,
|
1058
|
+
struct ggml_tensor * b);
|
1059
|
+
|
1060
|
+
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
|
1061
|
+
struct ggml_context * ctx,
|
1062
|
+
struct ggml_tensor * a,
|
1063
|
+
struct ggml_tensor * b,
|
1064
|
+
struct ggml_tensor * c);
|
1065
|
+
|
970
1066
|
//
|
971
1067
|
// automatic differentiation
|
972
1068
|
//
|
@@ -1061,6 +1157,8 @@ extern "C" {
|
|
1061
1157
|
struct {
|
1062
1158
|
int n_iter;
|
1063
1159
|
|
1160
|
+
float sched; // schedule multiplier (fixed, decay or warmup)
|
1161
|
+
float decay; // weight decay for AdamW, use 0.0f to disable
|
1064
1162
|
float alpha; // learning rate
|
1065
1163
|
float beta1;
|
1066
1164
|
float beta2;
|
@@ -1085,6 +1183,49 @@ extern "C" {
|
|
1085
1183
|
} lbfgs;
|
1086
1184
|
};
|
1087
1185
|
|
1186
|
+
struct ggml_opt_context {
|
1187
|
+
struct ggml_context * ctx;
|
1188
|
+
struct ggml_opt_params params;
|
1189
|
+
|
1190
|
+
int iter;
|
1191
|
+
int64_t nx; // number of parameter elements
|
1192
|
+
|
1193
|
+
bool just_initialized;
|
1194
|
+
|
1195
|
+
struct {
|
1196
|
+
struct ggml_tensor * x; // view of the parameters
|
1197
|
+
struct ggml_tensor * g1; // gradient
|
1198
|
+
struct ggml_tensor * g2; // gradient squared
|
1199
|
+
struct ggml_tensor * m; // first moment
|
1200
|
+
struct ggml_tensor * v; // second moment
|
1201
|
+
struct ggml_tensor * mh; // first moment hat
|
1202
|
+
struct ggml_tensor * vh; // second moment hat
|
1203
|
+
struct ggml_tensor * pf; // past function values
|
1204
|
+
float fx_best;
|
1205
|
+
float fx_prev;
|
1206
|
+
int n_no_improvement;
|
1207
|
+
} adam;
|
1208
|
+
|
1209
|
+
struct {
|
1210
|
+
struct ggml_tensor * x; // current parameters
|
1211
|
+
struct ggml_tensor * xp; // previous parameters
|
1212
|
+
struct ggml_tensor * g; // current gradient
|
1213
|
+
struct ggml_tensor * gp; // previous gradient
|
1214
|
+
struct ggml_tensor * d; // search direction
|
1215
|
+
struct ggml_tensor * pf; // past function values
|
1216
|
+
struct ggml_tensor * lmal; // the L-BFGS memory alpha
|
1217
|
+
struct ggml_tensor * lmys; // the L-BFGS memory ys
|
1218
|
+
struct ggml_tensor * lms; // the L-BFGS memory s
|
1219
|
+
struct ggml_tensor * lmy; // the L-BFGS memory y
|
1220
|
+
float fx_best;
|
1221
|
+
float step;
|
1222
|
+
int j;
|
1223
|
+
int k;
|
1224
|
+
int end;
|
1225
|
+
int n_no_improvement;
|
1226
|
+
} lbfgs;
|
1227
|
+
};
|
1228
|
+
|
1088
1229
|
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
1089
1230
|
|
1090
1231
|
// optimize the function defined by the tensor f
|
@@ -1093,6 +1234,27 @@ extern "C" {
|
|
1093
1234
|
struct ggml_opt_params params,
|
1094
1235
|
struct ggml_tensor * f);
|
1095
1236
|
|
1237
|
+
// initialize optimizer context
|
1238
|
+
GGML_API void ggml_opt_init(
|
1239
|
+
struct ggml_context * ctx,
|
1240
|
+
struct ggml_opt_context * opt,
|
1241
|
+
struct ggml_opt_params params,
|
1242
|
+
int64_t nx);
|
1243
|
+
|
1244
|
+
// continue optimizing the function defined by the tensor f
|
1245
|
+
GGML_API enum ggml_opt_result ggml_opt_resume(
|
1246
|
+
struct ggml_context * ctx,
|
1247
|
+
struct ggml_opt_context * opt,
|
1248
|
+
struct ggml_tensor * f);
|
1249
|
+
|
1250
|
+
// continue optimizing the function defined by the tensor f
|
1251
|
+
GGML_API enum ggml_opt_result ggml_opt_resume_g(
|
1252
|
+
struct ggml_context * ctx,
|
1253
|
+
struct ggml_opt_context * opt,
|
1254
|
+
struct ggml_tensor * f,
|
1255
|
+
struct ggml_cgraph * gf,
|
1256
|
+
struct ggml_cgraph * gb);
|
1257
|
+
|
1096
1258
|
//
|
1097
1259
|
// quantization
|
1098
1260
|
//
|