llama_cpp 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
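
The diff below covers ggml.h, the tensor-library header bundled with the crate. The headline changes between 0.1.4 and 0.2.1: six new k-quantization tensor types plus matching file types, API-neutral GPU backend enumerators, a family of backward operations (REPEAT_BACK, OUT_PROD, SOFT_MAX_BACK, FLASH_ATTN_BACK, CROSS_ENTROPY_LOSS and its backward) that enable training, the task/compute-params structs promoted into the public header, and a resumable optimizer context (ggml_opt_init / ggml_opt_resume).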
@@ -241,6 +241,13 @@ extern "C" {
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
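
The six GGML_TYPE_*_K entries add the k-quant formats to the existing block-quantization scheme. Their block geometry can be inspected through accessors this header already declares; a minimal probe, using only what ggml.h itself provides:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        const enum ggml_type kq[] = {
            GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_K,
            GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_K,
        };
        for (size_t i = 0; i < sizeof(kq)/sizeof(kq[0]); ++i) {
            // blck_size = elements packed per quantization block,
            // type_size = bytes occupied by one such block
            printf("type %d: blck_size=%d type_size=%zu\n",
                   (int) kq[i], ggml_blck_size(kq[i]), ggml_type_size(kq[i]));
        }
        return 0;
    }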
@@ -249,8 +256,8 @@ extern "C" {

     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_CUDA = 1,
-        GGML_BACKEND_CL = 2,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };

     // model file types
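
The backend enum drops the API-specific CUDA/CL enumerators in favor of API-neutral ones and renumbers them (10 and 20, presumably leaving room for future values), so any code that matched or serialized the old constants must be updated. A hedged dispatch sketch; it assumes the `backend` field that the GPU offloading code keeps on each tensor:

    #include "ggml.h"

    // Hedged migration sketch over the renumbered enum; `t->backend` is
    // assumed to exist on struct ggml_tensor in this version.
    static const char * backend_name(const struct ggml_tensor * t) {
        switch (t->backend) {
            case GGML_BACKEND_CPU:       return "cpu";
            case GGML_BACKEND_GPU:       return "gpu";       // resident on a single device
            case GGML_BACKEND_GPU_SPLIT: return "gpu-split"; // rows split across devices
        }
        return "unknown";
    }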
@@ -264,6 +271,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
     };

     // available tensor operations:
@@ -284,6 +296,7 @@ extern "C" {
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_REPEAT,
+        GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
@@ -297,6 +310,7 @@ extern "C" {
         GGML_OP_RMS_NORM_BACK,

         GGML_OP_MUL_MAT,
+        GGML_OP_OUT_PROD,

         GGML_OP_SCALE,
         GGML_OP_SET,
@@ -312,6 +326,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
+        GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
@@ -321,10 +336,14 @@ extern "C" {

         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
+        GGML_OP_FLASH_ATTN_BACK,

         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,

+        GGML_OP_CROSS_ENTROPY_LOSS,
+        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+
         GGML_OP_COUNT,
     };

@@ -375,7 +394,9 @@ extern "C" {

         char name[GGML_MAX_NAME];

-        char padding[16];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -413,6 +434,25 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };

+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc

     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
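
ggml_task_type and ggml_compute_params were previously internal to ggml.c; exposing them documents the three-phase kernel contract. In ggml's scheduler the INIT and FINALIZE phases run single-threaded while COMPUTE is fanned out across nth threads, each claiming a contiguous slice of rows from its ith/nth coordinates. A sketch of a hypothetical kernel body following that convention (the helper itself is not part of the API):

    #include "ggml.h"

    // Hypothetical kernel body; ggml_nrows() is added in the next hunk.
    static void my_op_compute(const struct ggml_compute_params * params,
                              struct ggml_tensor * dst) {
        if (params->type != GGML_TASK_COMPUTE) {
            return; // this op needs no INIT or FINALIZE work
        }

        const int64_t nr = ggml_nrows(dst);

        const int64_t dr  = (nr + params->nth - 1) / params->nth; // rows per thread
        const int64_t ir0 = dr * params->ith;                     // first row for this thread
        const int64_t ir1 = ir0 + dr < nr ? ir0 + dr : nr;        // one past the last row

        for (int64_t ir = ir0; ir < ir1; ++ir) {
            // ... compute row ir of dst, optionally staging through
            //     params->wdata (params->wsize bytes shared by all threads) ...
        }
    }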
@@ -424,8 +464,10 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

     GGML_API int     ggml_blck_size (enum ggml_type type);
     GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
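
ggml_nrows() counts rows across all dimensions beyond ne[0], and ggml_nbytes_split(), judging by its name and signature, sizes the first nrows_split rows of a tensor, which is what a GGML_BACKEND_GPU_SPLIT placement needs. A speculative two-device sizing helper (the even split and the exact semantics of ggml_nbytes_split() are assumptions here):

    #include "ggml.h"

    // Speculative helper: size the two halves of a row-split tensor.
    static void split_sizes(const struct ggml_tensor * t, size_t * dev0, size_t * dev1) {
        const int nr_half = (int)(ggml_nrows(t)/2);
        *dev0 = ggml_nbytes_split(t, nr_half); // first half of the rows
        *dev1 = ggml_nbytes(t) - *dev0;        // remainder for the second device
    }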
@@ -441,13 +483,17 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);

     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);

     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

@@ -536,6 +582,11 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_add1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_acc(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -607,6 +658,11 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    GGML_API struct ggml_tensor * ggml_repeat_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
@@ -660,14 +716,22 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

-    // A: m rows, n columns
-    // B: p rows, n columns (i.e. we transpose it internally)
+    // A: n columns, m rows
+    // B: n columns, p rows (i.e. we transpose it internally)
     // result is m columns, p rows
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    // A: m columns, n rows,
+    // B: p columns, n rows,
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_out_prod(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     //
     // operations on tensors without backpropagation
     //
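
The rewritten comment matters because ggml's dimension order is the reverse of textbook row-major notation: ne[0] is the number of columns (the contiguous dimension), ne[1] the number of rows. A compilable shape sketch with concrete sizes (the scratch-buffer size is an arbitrary pick for the example):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        const int n = 64, m = 32, p = 16;

        // ggml_mul_mat: A is n columns x m rows, B is n columns x p rows
        struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, m);
        struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, p);
        struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);   // C->ne[0] == m, C->ne[1] == p

        // ggml_out_prod: A is m columns x n rows, B is p columns x n rows
        struct ggml_tensor * A2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, m, n);
        struct ggml_tensor * B2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, p, n);
        struct ggml_tensor * D  = ggml_out_prod(ctx, A2, B2); // D->ne[0] == m, D->ne[1] == p

        (void) C; (void) D;
        ggml_free(ctx);
        return 0;
    }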
@@ -878,6 +942,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_soft_max_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements
     // if mode & 2 == 1, GPT-NeoX style
@@ -944,6 +1019,14 @@ extern "C" {
             struct ggml_tensor * v,
             bool masked);

+    GGML_API struct ggml_tensor * ggml_flash_attn_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * q,
+            struct ggml_tensor * k,
+            struct ggml_tensor * v,
+            struct ggml_tensor * d,
+            bool masked);
+
     GGML_API struct ggml_tensor * ggml_flash_ff(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -967,6 +1050,19 @@ extern "C" {
             struct ggml_tensor * b,
             ggml_binary_op_f32_t fun);

+    // loss function
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c);
+
     //
     // automatic differentiation
     //
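
Only the signatures are public; the tensor roles are not documented in the header. By analogy with how training code typically drives such an op, the sketch below assumes a holds logits and b target probabilities, with the _back variant threading one extra tensor (presumably the incoming gradient) through as c:

    #include "ggml.h"

    // Hedged sketch: scalar cross-entropy loss over logits vs. targets, plus
    // the backward graph. The logits-first argument order is an assumption.
    static struct ggml_tensor * build_loss(struct ggml_context * ctx,
                                           int64_t n_vocab, int64_t n_tokens) {
        struct ggml_tensor * logits  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_tokens);
        struct ggml_tensor * targets = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_tokens);
        ggml_set_param(ctx, logits); // request gradients w.r.t. the logits

        struct ggml_tensor * loss = ggml_cross_entropy_loss(ctx, logits, targets);

        // evaluating gb (e.g. via ggml_graph_compute) fills logits->grad
        struct ggml_cgraph gf = ggml_build_forward(loss);
        struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, /*keep =*/ false);
        (void) gb;

        return loss;
    }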
@@ -1061,6 +1157,8 @@ extern "C" {
         struct {
             int n_iter;

+            float sched; // schedule multiplier (fixed, decay or warmup)
+            float decay; // weight decay for AdamW, use 0.0f to disable
             float alpha; // learning rate
             float beta1;
             float beta2;
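
sched and decay extend plain Adam toward AdamW with a learning-rate schedule. A plausible configuration starting from the defaults (the numbers are illustrative, not recommendations):

    #include "ggml.h"

    struct ggml_opt_params make_adamw_params(void) {
        struct ggml_opt_params p = ggml_opt_default_params(GGML_OPT_ADAM);
        p.adam.sched = 1.0f; // constant schedule multiplier (no warmup/decay ramp)
        p.adam.decay = 0.1f; // non-zero value switches on AdamW-style weight decay
        return p;
    }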
@@ -1085,6 +1183,49 @@ extern "C" {
         } lbfgs;
     };

+    struct ggml_opt_context {
+        struct ggml_context * ctx;
+        struct ggml_opt_params params;
+
+        int iter;
+        int64_t nx; // number of parameter elements
+
+        bool just_initialized;
+
+        struct {
+            struct ggml_tensor * x;  // view of the parameters
+            struct ggml_tensor * g1; // gradient
+            struct ggml_tensor * g2; // gradient squared
+            struct ggml_tensor * m;  // first moment
+            struct ggml_tensor * v;  // second moment
+            struct ggml_tensor * mh; // first moment hat
+            struct ggml_tensor * vh; // second moment hat
+            struct ggml_tensor * pf; // past function values
+            float fx_best;
+            float fx_prev;
+            int n_no_improvement;
+        } adam;
+
+        struct {
+            struct ggml_tensor * x;    // current parameters
+            struct ggml_tensor * xp;   // previous parameters
+            struct ggml_tensor * g;    // current gradient
+            struct ggml_tensor * gp;   // previous gradient
+            struct ggml_tensor * d;    // search direction
+            struct ggml_tensor * pf;   // past function values
+            struct ggml_tensor * lmal; // the L-BFGS memory alpha
+            struct ggml_tensor * lmys; // the L-BFGS memory ys
+            struct ggml_tensor * lms;  // the L-BFGS memory s
+            struct ggml_tensor * lmy;  // the L-BFGS memory y
+            float fx_best;
+            float step;
+            int j;
+            int k;
+            int end;
+            int n_no_improvement;
+        } lbfgs;
+    };
+
     GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);

     // optimize the function defined by the tensor f
@@ -1093,6 +1234,27 @@ extern "C" {
             struct ggml_opt_params params,
             struct ggml_tensor * f);

+    // initialize optimizer context
+    GGML_API void ggml_opt_init(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_opt_params params,
+            int64_t nx);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume_g(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f,
+            struct ggml_cgraph * gf,
+            struct ggml_cgraph * gb);
+
     //
     // quantization
     //