llama_cpp gem 0.1.4 → 0.2.1 — diff of the bundled ggml header (ggml.h)

Sign up to get free protection for your applications and to get access to all the features.
@@ -241,6 +241,13 @@ extern "C" {
241
241
  GGML_TYPE_Q5_1 = 7,
242
242
  GGML_TYPE_Q8_0 = 8,
243
243
  GGML_TYPE_Q8_1 = 9,
244
+ // k-quantizations
245
+ GGML_TYPE_Q2_K = 10,
246
+ GGML_TYPE_Q3_K = 11,
247
+ GGML_TYPE_Q4_K = 12,
248
+ GGML_TYPE_Q5_K = 13,
249
+ GGML_TYPE_Q6_K = 14,
250
+ GGML_TYPE_Q8_K = 15,
244
251
  GGML_TYPE_I8,
245
252
  GGML_TYPE_I16,
246
253
  GGML_TYPE_I32,
@@ -249,8 +256,8 @@ extern "C" {
249
256
 
250
257
  enum ggml_backend {
251
258
  GGML_BACKEND_CPU = 0,
252
- GGML_BACKEND_CUDA = 1,
253
- GGML_BACKEND_CL = 2,
259
+ GGML_BACKEND_GPU = 10,
260
+ GGML_BACKEND_GPU_SPLIT = 20,
254
261
  };
255
262
 
256
263
  // model file types
@@ -264,6 +271,11 @@ extern "C" {
264
271
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
265
272
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
266
273
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
274
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
275
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
276
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
277
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
278
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
267
279
  };
268
280
 
269
281
  // available tensor operations:
@@ -284,6 +296,7 @@ extern "C" {
284
296
  GGML_OP_SUM_ROWS,
285
297
  GGML_OP_MEAN,
286
298
  GGML_OP_REPEAT,
299
+ GGML_OP_REPEAT_BACK,
287
300
  GGML_OP_ABS,
288
301
  GGML_OP_SGN,
289
302
  GGML_OP_NEG,
@@ -297,6 +310,7 @@ extern "C" {
297
310
  GGML_OP_RMS_NORM_BACK,
298
311
 
299
312
  GGML_OP_MUL_MAT,
313
+ GGML_OP_OUT_PROD,
300
314
 
301
315
  GGML_OP_SCALE,
302
316
  GGML_OP_SET,
@@ -312,6 +326,7 @@ extern "C" {
312
326
  GGML_OP_DIAG_MASK_INF,
313
327
  GGML_OP_DIAG_MASK_ZERO,
314
328
  GGML_OP_SOFT_MAX,
329
+ GGML_OP_SOFT_MAX_BACK,
315
330
  GGML_OP_ROPE,
316
331
  GGML_OP_ROPE_BACK,
317
332
  GGML_OP_ALIBI,
@@ -321,10 +336,14 @@ extern "C" {
321
336
 
322
337
  GGML_OP_FLASH_ATTN,
323
338
  GGML_OP_FLASH_FF,
339
+ GGML_OP_FLASH_ATTN_BACK,
324
340
 
325
341
  GGML_OP_MAP_UNARY,
326
342
  GGML_OP_MAP_BINARY,
327
343
 
344
+ GGML_OP_CROSS_ENTROPY_LOSS,
345
+ GGML_OP_CROSS_ENTROPY_LOSS_BACK,
346
+
328
347
  GGML_OP_COUNT,
329
348
  };
330
349
 
@@ -375,7 +394,9 @@ extern "C" {
375
394
 
376
395
  char name[GGML_MAX_NAME];
377
396
 
378
- char padding[16];
397
+ void * extra; // extra things e.g. for ggml-cuda.cu
398
+
399
+ char padding[4];
379
400
  };
380
401
 
381
402
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -413,6 +434,25 @@ extern "C" {
413
434
  bool no_alloc; // don't allocate memory for the tensor data
414
435
  };
415
436
 
437
+
438
+ // compute types
439
+ enum ggml_task_type {
440
+ GGML_TASK_INIT = 0,
441
+ GGML_TASK_COMPUTE,
442
+ GGML_TASK_FINALIZE,
443
+ };
444
+
445
+ struct ggml_compute_params {
446
+ enum ggml_task_type type;
447
+
448
+ // ith = thread index, nth = number of threads
449
+ int ith, nth;
450
+
451
+ // work buffer for all threads
452
+ size_t wsize;
453
+ void * wdata;
454
+ };
455
+
416
456
  // misc
417
457
 
418
458
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -424,8 +464,10 @@ extern "C" {
424
464
  GGML_API void ggml_print_object (const struct ggml_object * obj);
425
465
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
426
466
 
427
- GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
428
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
467
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
468
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
469
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
470
+ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
429
471
 
430
472
  GGML_API int ggml_blck_size (enum ggml_type type);
431
473
  GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -441,13 +483,17 @@ extern "C" {
441
483
  // TODO: temporary until model loading of ggml examples is refactored
442
484
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
443
485
 
486
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
487
+ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
488
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
489
+
444
490
  // use this to compute the memory overhead of a tensor
445
491
  GGML_API size_t ggml_tensor_overhead(void);
446
492
 
447
493
  // main
448
494
 
449
495
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
450
- GGML_API void ggml_free(struct ggml_context * ctx);
496
+ GGML_API void ggml_free(struct ggml_context * ctx);
451
497
 
452
498
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
453
499
 
@@ -536,6 +582,11 @@ extern "C" {
536
582
  struct ggml_tensor * a,
537
583
  struct ggml_tensor * b);
538
584
 
585
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
586
+ struct ggml_context * ctx,
587
+ struct ggml_tensor * a,
588
+ struct ggml_tensor * b);
589
+
539
590
  GGML_API struct ggml_tensor * ggml_acc(
540
591
  struct ggml_context * ctx,
541
592
  struct ggml_tensor * a,
@@ -607,6 +658,11 @@ extern "C" {
607
658
  struct ggml_tensor * a,
608
659
  struct ggml_tensor * b);
609
660
 
661
+ GGML_API struct ggml_tensor * ggml_repeat_back(
662
+ struct ggml_context * ctx,
663
+ struct ggml_tensor * a,
664
+ struct ggml_tensor * b);
665
+
610
666
  GGML_API struct ggml_tensor * ggml_abs(
611
667
  struct ggml_context * ctx,
612
668
  struct ggml_tensor * a);
@@ -660,14 +716,22 @@ extern "C" {
660
716
  struct ggml_tensor * a,
661
717
  struct ggml_tensor * b);
662
718
 
663
- // A: m rows, n columns
664
- // B: p rows, n columns (i.e. we transpose it internally)
719
+ // A: n columns, m rows
720
+ // B: n columns, p rows (i.e. we transpose it internally)
665
721
  // result is m columns, p rows
666
722
  GGML_API struct ggml_tensor * ggml_mul_mat(
667
723
  struct ggml_context * ctx,
668
724
  struct ggml_tensor * a,
669
725
  struct ggml_tensor * b);
670
726
 
727
+ // A: m columns, n rows,
728
+ // B: p columns, n rows,
729
+ // result is m columns, p rows
730
+ GGML_API struct ggml_tensor * ggml_out_prod(
731
+ struct ggml_context * ctx,
732
+ struct ggml_tensor * a,
733
+ struct ggml_tensor * b);
734
+
671
735
  //
672
736
  // operations on tensors without backpropagation
673
737
  //
@@ -878,6 +942,17 @@ extern "C" {
878
942
  struct ggml_context * ctx,
879
943
  struct ggml_tensor * a);
880
944
 
945
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
946
+ struct ggml_context * ctx,
947
+ struct ggml_tensor * a,
948
+ struct ggml_tensor * b);
949
+
950
+ // in-place, returns view(a)
951
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
952
+ struct ggml_context * ctx,
953
+ struct ggml_tensor * a,
954
+ struct ggml_tensor * b);
955
+
881
956
  // rotary position embedding
882
957
  // if mode & 1 == 1, skip n_past elements
883
958
  // if mode & 2 == 1, GPT-NeoX style
@@ -944,6 +1019,14 @@ extern "C" {
944
1019
  struct ggml_tensor * v,
945
1020
  bool masked);
946
1021
 
1022
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
1023
+ struct ggml_context * ctx,
1024
+ struct ggml_tensor * q,
1025
+ struct ggml_tensor * k,
1026
+ struct ggml_tensor * v,
1027
+ struct ggml_tensor * d,
1028
+ bool masked);
1029
+
947
1030
  GGML_API struct ggml_tensor * ggml_flash_ff(
948
1031
  struct ggml_context * ctx,
949
1032
  struct ggml_tensor * a,
@@ -967,6 +1050,19 @@ extern "C" {
967
1050
  struct ggml_tensor * b,
968
1051
  ggml_binary_op_f32_t fun);
969
1052
 
1053
+ // loss function
1054
+
1055
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1056
+ struct ggml_context * ctx,
1057
+ struct ggml_tensor * a,
1058
+ struct ggml_tensor * b);
1059
+
1060
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1061
+ struct ggml_context * ctx,
1062
+ struct ggml_tensor * a,
1063
+ struct ggml_tensor * b,
1064
+ struct ggml_tensor * c);
1065
+
970
1066
  //
971
1067
  // automatic differentiation
972
1068
  //
@@ -1061,6 +1157,8 @@ extern "C" {
1061
1157
  struct {
1062
1158
  int n_iter;
1063
1159
 
1160
+ float sched; // schedule multiplier (fixed, decay or warmup)
1161
+ float decay; // weight decay for AdamW, use 0.0f to disable
1064
1162
  float alpha; // learning rate
1065
1163
  float beta1;
1066
1164
  float beta2;
@@ -1085,6 +1183,49 @@ extern "C" {
1085
1183
  } lbfgs;
1086
1184
  };
1087
1185
 
1186
+ struct ggml_opt_context {
1187
+ struct ggml_context * ctx;
1188
+ struct ggml_opt_params params;
1189
+
1190
+ int iter;
1191
+ int64_t nx; // number of parameter elements
1192
+
1193
+ bool just_initialized;
1194
+
1195
+ struct {
1196
+ struct ggml_tensor * x; // view of the parameters
1197
+ struct ggml_tensor * g1; // gradient
1198
+ struct ggml_tensor * g2; // gradient squared
1199
+ struct ggml_tensor * m; // first moment
1200
+ struct ggml_tensor * v; // second moment
1201
+ struct ggml_tensor * mh; // first moment hat
1202
+ struct ggml_tensor * vh; // second moment hat
1203
+ struct ggml_tensor * pf; // past function values
1204
+ float fx_best;
1205
+ float fx_prev;
1206
+ int n_no_improvement;
1207
+ } adam;
1208
+
1209
+ struct {
1210
+ struct ggml_tensor * x; // current parameters
1211
+ struct ggml_tensor * xp; // previous parameters
1212
+ struct ggml_tensor * g; // current gradient
1213
+ struct ggml_tensor * gp; // previous gradient
1214
+ struct ggml_tensor * d; // search direction
1215
+ struct ggml_tensor * pf; // past function values
1216
+ struct ggml_tensor * lmal; // the L-BFGS memory alpha
1217
+ struct ggml_tensor * lmys; // the L-BFGS memory ys
1218
+ struct ggml_tensor * lms; // the L-BFGS memory s
1219
+ struct ggml_tensor * lmy; // the L-BFGS memory y
1220
+ float fx_best;
1221
+ float step;
1222
+ int j;
1223
+ int k;
1224
+ int end;
1225
+ int n_no_improvement;
1226
+ } lbfgs;
1227
+ };
1228
+
1088
1229
  GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
1089
1230
 
1090
1231
  // optimize the function defined by the tensor f
@@ -1093,6 +1234,27 @@ extern "C" {
1093
1234
  struct ggml_opt_params params,
1094
1235
  struct ggml_tensor * f);
1095
1236
 
1237
+ // initialize optimizer context
1238
+ GGML_API void ggml_opt_init(
1239
+ struct ggml_context * ctx,
1240
+ struct ggml_opt_context * opt,
1241
+ struct ggml_opt_params params,
1242
+ int64_t nx);
1243
+
1244
+ // continue optimizing the function defined by the tensor f
1245
+ GGML_API enum ggml_opt_result ggml_opt_resume(
1246
+ struct ggml_context * ctx,
1247
+ struct ggml_opt_context * opt,
1248
+ struct ggml_tensor * f);
1249
+
1250
+ // continue optimizing the function defined by the tensor f
1251
+ GGML_API enum ggml_opt_result ggml_opt_resume_g(
1252
+ struct ggml_context * ctx,
1253
+ struct ggml_opt_context * opt,
1254
+ struct ggml_tensor * f,
1255
+ struct ggml_cgraph * gf,
1256
+ struct ggml_cgraph * gb);
1257
+
1096
1258
  //
1097
1259
  // quantization
1098
1260
  //