llama_cpp 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
```diff
@@ -241,6 +241,13 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -249,8 +256,8 @@ extern "C" {
 
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-
-
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };
 
     // model file types
```
```diff
@@ -264,6 +271,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
     };
 
     // available tensor operations:
```
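The new `GGML_TYPE_Q2_K` … `GGML_TYPE_Q8_K` entries and the matching `GGML_FTYPE_MOSTLY_*` file types correspond to the k-quant formats implemented by the newly added `k_quants.c` / `k_quants.h`. A minimal sketch (not part of the package) of how a caller could inspect one of the new types, using the `ggml_blck_size` / `ggml_type_size` accessors that appear further down in this diff; `ggml_type_name` is assumed to be available from the same header:

```c
// Illustration only: print the block geometry of one of the new k-quant types.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    enum ggml_type t = GGML_TYPE_Q4_K;
    printf("%s: %d elements per block, %zu bytes per block\n",
           ggml_type_name(t),   // assumed available in this header revision
           ggml_blck_size(t),   // elements per quantization block
           ggml_type_size(t));  // bytes per quantization block
    return 0;
}
```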
```diff
@@ -284,6 +296,7 @@ extern "C" {
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
+        GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
@@ -297,6 +310,7 @@ extern "C" {
         GGML_OP_RMS_NORM_BACK,
 
         GGML_OP_MUL_MAT,
+        GGML_OP_OUT_PROD,
 
         GGML_OP_SCALE,
         GGML_OP_SET,
@@ -312,6 +326,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
+        GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
@@ -321,10 +336,14 @@ extern "C" {
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
+        GGML_OP_FLASH_ATTN_BACK,
 
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
+        GGML_OP_CROSS_ENTROPY_LOSS,
+        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+
         GGML_OP_COUNT,
     };
 
```
```diff
@@ -375,7 +394,9 @@ extern "C" {
 
         char name[GGML_MAX_NAME];
 
-
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
```
```diff
@@ -413,6 +434,25 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };
 
+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
```
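`ggml_task_type` and `ggml_compute_params` were previously private to `ggml.c`; publishing them here is presumably what lets the new GPU backends (`ggml-cuda.cu`, `ggml-metal.m`) hook into the per-op compute path, where each op is invoked once per task type on every thread. A hypothetical helper (illustration only, not part of the API) showing the row-partitioning convention the `ith` / `nth` comments describe:

```c
// Hypothetical sketch: split `nrows` rows of work across the nth threads,
// giving this thread (index ith) the half-open row range [*ir0, *ir1).
#include "ggml.h"

static void rows_for_thread(const struct ggml_compute_params * params,
                            int64_t nrows, int64_t * ir0, int64_t * ir1) {
    const int64_t dr = (nrows + params->nth - 1) / params->nth; // rows per thread, rounded up
    *ir0 = dr * params->ith;                                    // first row for this thread
    *ir1 = (*ir0 + dr < nrows) ? *ir0 + dr : nrows;             // one past the last row
}
```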
```diff
@@ -424,8 +464,10 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int     ggml_blck_size (enum ggml_type type);
     GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
```
```diff
@@ -441,13 +483,17 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
```
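Alongside the new `ggml_is_*` predicates, the size queries gain `ggml_nrows` and `ggml_nbytes_split`. A small standalone sketch (hypothetical usage, relying only on the existing `ggml_init` / `ggml_new_tensor_2d` API) of what the queries report for a plain F32 tensor:

```c
// Illustration only: query the geometry of a freshly created 4x3 F32 tensor.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(ip);

    // ne0 = 4 columns, ne1 = 3 rows
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);

    printf("nelements=%lld nrows=%lld nbytes=%zu contiguous=%d\n",
           (long long) ggml_nelements(a), (long long) ggml_nrows(a),
           ggml_nbytes(a), ggml_is_contiguous(a));

    ggml_free(ctx);
    return 0;
}
```

For this tensor the queries report 12 elements, 3 rows and 48 bytes, and the tensor is contiguous.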
```diff
@@ -536,6 +582,11 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_acc(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -607,6 +658,11 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_repeat_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
```
```diff
@@ -660,14 +716,22 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    // A:
-    // B:
+    // A: n columns, m rows
+    // B: n columns, p rows (i.e. we transpose it internally)
     // result is m columns, p rows
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // A: m columns, n rows,
+    // B: p columns, n rows,
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_out_prod(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     //
     // operations on tensors without backpropagation
     //
```
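The comments pin down the shape conventions: `ggml_mul_mat` contracts A and B over their shared column count (`ne[0]`), while the new `ggml_out_prod` contracts over their shared row count (`ne[1]`). A shape-only sketch (hypothetical helper; graph construction only, no compute):

```c
// Illustration only: check the result shapes of ggml_mul_mat and ggml_out_prod.
// ne[0] is the column count (row length), ne[1] the row count; `ctx` is assumed
// to come from ggml_init as in the other examples.
#include <assert.h>
#include "ggml.h"

static void shapes_example(struct ggml_context * ctx) {
    // mul_mat: A is n=6 columns x m=4 rows, B is n=6 columns x p=5 rows
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 6, 4);
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 6, 5);
    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);     // m=4 columns, p=5 rows
    assert(C->ne[0] == 4 && C->ne[1] == 5);

    // out_prod: A2 is m=4 columns x n=6 rows, B2 is p=5 columns x n=6 rows
    struct ggml_tensor * A2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 6);
    struct ggml_tensor * B2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 5, 6);
    struct ggml_tensor * D  = ggml_out_prod(ctx, A2, B2); // m=4 columns, p=5 rows
    assert(D->ne[0] == 4 && D->ne[1] == 5);
}
```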
```diff
@@ -878,6 +942,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_soft_max_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
@@ -944,6 +1019,14 @@ extern "C" {
             struct ggml_tensor * v,
             bool masked);
 
+    GGML_API struct ggml_tensor * ggml_flash_attn_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * q,
+            struct ggml_tensor * k,
+            struct ggml_tensor * v,
+            struct ggml_tensor * d,
+            bool masked);
+
     GGML_API struct ggml_tensor * ggml_flash_ff(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
```
```diff
@@ -967,6 +1050,19 @@ extern "C" {
             struct ggml_tensor * b,
             ggml_binary_op_f32_t fun);
 
+    // loss function
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
     //
     // automatic differentiation
     //
```
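`ggml_cross_entropy_loss` takes predicted logits and target probabilities of the same shape and yields a scalar loss tensor; `ggml_cross_entropy_loss_back` is its gradient counterpart used by the backward pass. A minimal end-to-end sketch, assuming the `ggml_build_forward` / `ggml_graph_compute` graph API and the `n_threads` field of `ggml_cgraph` from this same ggml revision:

```c
// Illustration only: build and evaluate the new cross-entropy loss on dummy data.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * logits  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * targets = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(logits,  0.5f);        // dummy logits
    ggml_set_f32(targets, 1.0f/8.0f);   // uniform target distribution

    struct ggml_tensor * loss = ggml_cross_entropy_loss(ctx, logits, targets);

    struct ggml_cgraph gf = ggml_build_forward(loss);
    gf.n_threads = 1;                   // assumed field in this ggml revision
    ggml_graph_compute(ctx, &gf);

    printf("loss = %f\n", ggml_get_f32_1d(loss, 0));
    ggml_free(ctx);
    return 0;
}
```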
```diff
@@ -1061,6 +1157,8 @@ extern "C" {
         struct {
             int n_iter;
 
+            float sched; // schedule multiplier (fixed, decay or warmup)
+            float decay; // weight decay for AdamW, use 0.0f to disable
             float alpha; // learning rate
             float beta1;
             float beta2;
@@ -1085,6 +1183,49 @@ extern "C" {
         } lbfgs;
     };
 
+    struct ggml_opt_context {
+        struct ggml_context * ctx;
+        struct ggml_opt_params params;
+
+        int iter;
+        int64_t nx; // number of parameter elements
+
+        bool just_initialized;
+
+        struct {
+            struct ggml_tensor * x;  // view of the parameters
+            struct ggml_tensor * g1; // gradient
+            struct ggml_tensor * g2; // gradient squared
+            struct ggml_tensor * m;  // first moment
+            struct ggml_tensor * v;  // second moment
+            struct ggml_tensor * mh; // first moment hat
+            struct ggml_tensor * vh; // second moment hat
+            struct ggml_tensor * pf; // past function values
+            float fx_best;
+            float fx_prev;
+            int n_no_improvement;
+        } adam;
+
+        struct {
+            struct ggml_tensor * x;    // current parameters
+            struct ggml_tensor * xp;   // previous parameters
+            struct ggml_tensor * g;    // current gradient
+            struct ggml_tensor * gp;   // previous gradient
+            struct ggml_tensor * d;    // search direction
+            struct ggml_tensor * pf;   // past function values
+            struct ggml_tensor * lmal; // the L-BFGS memory alpha
+            struct ggml_tensor * lmys; // the L-BFGS memory ys
+            struct ggml_tensor * lms;  // the L-BFGS memory s
+            struct ggml_tensor * lmy;  // the L-BFGS memory y
+            float fx_best;
+            float step;
+            int j;
+            int k;
+            int end;
+            int n_no_improvement;
+        } lbfgs;
+    };
+
     GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
 
     // optimize the function defined by the tensor f
```
```diff
@@ -1093,6 +1234,27 @@ extern "C" {
             struct ggml_opt_params params,
             struct ggml_tensor * f);
 
+    // initialize optimizer context
+    GGML_API void ggml_opt_init(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_opt_params params,
+            int64_t nx);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume_g(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f,
+            struct ggml_cgraph * gf,
+            struct ggml_cgraph * gb);
+
     //
     // quantization
     //
```
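These entry points split the previously one-shot `ggml_opt` into an initialization step and resumable steps: `ggml_opt_init` sets up a `ggml_opt_context` for `nx` parameter elements, and `ggml_opt_resume` / `ggml_opt_resume_g` keep optimizing `f` while preserving the Adam or L-BFGS state between calls. A hedged sketch of the intended flow (`train_loop`, `loss` and `n_params` are hypothetical; the new `adam.sched` / `adam.decay` fields are set purely for illustration):

```c
// Illustration only: resumable optimization of a scalar loss tensor across batches.
#include "ggml.h"

static void train_loop(struct ggml_context * ctx,
                       struct ggml_tensor  * loss,      // scalar loss over ggml parameters
                       int64_t               n_params,  // total number of trainable parameter elements
                       int                   n_batches) {
    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
    params.adam.sched = 1.0f; // new field: schedule multiplier (fixed, decay or warmup)
    params.adam.decay = 0.0f; // new field: AdamW weight decay, 0.0f disables it

    struct ggml_opt_context opt;
    ggml_opt_init(ctx, &opt, params, n_params);

    for (int i = 0; i < n_batches; ++i) {
        // ...load the next batch into the graph's input tensors here...
        ggml_opt_resume(ctx, &opt, loss);
    }
}
```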