llama_cpp 0.3.4 → 0.3.6

@@ -199,6 +199,7 @@
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_SRC 6
  #define GGML_MAX_NAME 48
+ #define GGML_MAX_OP_PARAMS 32
  #define GGML_DEFAULT_N_THREADS 4


@@ -207,6 +208,7 @@

  #define GGML_UNUSED(x) (void)(x)

+ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

  #define GGML_ASSERT(x) \
  do { \
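
The new GGML_PAD macro rounds a value up to the next multiple of n, where n must be a power of two. A minimal illustration in C (the function and the alignment value are ours, chosen only for the example):

    #include "ggml.h"

    // GGML_PAD(x, n) is the smallest multiple of n that is >= x, for power-of-two n
    // e.g. GGML_PAD(17, 16) == 32 and GGML_PAD(32, 16) == 32
    static size_t padded_size(size_t nbytes) {
        return GGML_PAD(nbytes, 16);   // 16 is an illustrative alignment
    }
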
@@ -329,16 +331,6 @@ extern "C" {
  GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
- GGML_OP_ABS,
- GGML_OP_SGN,
- GGML_OP_NEG,
- GGML_OP_STEP,
- GGML_OP_TANH,
- GGML_OP_ELU,
- GGML_OP_RELU,
- GGML_OP_GELU,
- GGML_OP_GELU_QUICK,
- GGML_OP_SILU,
  GGML_OP_SILU_BACK,
  GGML_OP_NORM, // normalize
  GGML_OP_RMS_NORM,
@@ -377,6 +369,8 @@ extern "C" {
  GGML_OP_WIN_PART,
  GGML_OP_WIN_UNPART,

+ GGML_OP_UNARY,
+
  GGML_OP_MAP_UNARY,
  GGML_OP_MAP_BINARY,

@@ -390,6 +384,24 @@ extern "C" {
  GGML_OP_COUNT,
  };

+ enum ggml_unary_op {
+ GGML_UNARY_OP_ABS,
+ GGML_UNARY_OP_SGN,
+ GGML_UNARY_OP_NEG,
+ GGML_UNARY_OP_STEP,
+ GGML_UNARY_OP_TANH,
+ GGML_UNARY_OP_ELU,
+ GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_GELU,
+ GGML_UNARY_OP_GELU_QUICK,
+ GGML_UNARY_OP_SILU,
+ };
+
+ enum ggml_object_type {
+ GGML_OBJECT_TENSOR,
+ GGML_OBJECT_GRAPH,
+ GGML_OBJECT_WORK_BUFFER
+ };

  // ggml object
  struct ggml_object {
@@ -398,7 +410,9 @@ extern "C" {

  struct ggml_object * next;

- char padding[8];
+ enum ggml_object_type type;
+
+ char padding[4];
  };

  static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@@ -418,6 +432,9 @@ extern "C" {
  // compute data
  enum ggml_op op;

+ // op params - allocated as int32_t for alignment
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
  bool is_param;

  struct ggml_tensor * grad;
@@ -434,7 +451,7 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[8];
+ char padding[4];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
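
struct ggml_tensor now carries up to GGML_MAX_OP_PARAMS bytes of per-operator parameters inline, stored as int32_t for alignment, while the trailing padding shrinks from 8 to 4 bytes. The header only exposes the field; as a hedged sketch of how a small parameter such as a float epsilon could be packed into it (the helper below is ours, not part of the public API):

    #include <string.h>
    #include "ggml.h"

    // hypothetical helper: copy one float into the tensor's op_params buffer
    static void set_f32_op_param(struct ggml_tensor * t, float value) {
        // GGML_MAX_OP_PARAMS bytes are available, so a single float easily fits
        memcpy(t->op_params, &value, sizeof(value));
    }
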
@@ -455,6 +472,11 @@ extern "C" {
  void * abort_callback_data;
  };

+ // next prime after GGML_MAX_NODES
+ // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+ // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+ #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
@@ -464,12 +486,16 @@ extern "C" {
  struct ggml_tensor * grads[GGML_MAX_NODES];
  struct ggml_tensor * leafs[GGML_MAX_NODES];

+ void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
  // performance
  int perf_runs;
  int64_t perf_cycles;
  int64_t perf_time_us;
  };

+ static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
  // scratch buffer
  struct ggml_scratch {
  size_t offs;
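
struct ggml_cgraph now embeds a pointer table whose size is a prime (8273, the next prime after GGML_MAX_NODES * 2) so graph construction can check whether a tensor has already been visited, and GGML_GRAPH_SIZE exposes the enlarged struct's footprint. A hedged sketch of the kind of open-addressing membership test such a table supports; the helper is illustrative and assumes linear probing, it is not the library's code:

    #include <stdbool.h>
    #include <stddef.h>
    #include "ggml.h"

    // illustrative linear-probing lookup over a visited_hash_table-style array
    static bool visited(void ** table, const void * p) {
        size_t i = (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
        while (table[i] != NULL) {
            if (table[i] == p) {
                return true;
            }
            i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
        }
        return false;
    }
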
@@ -531,6 +557,7 @@ extern "C" {

  GGML_API const char * ggml_type_name(enum ggml_type type);
  GGML_API const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);

  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -554,6 +581,7 @@ extern "C" {
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

  GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
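
ggml_get_no_alloc complements the existing setter, which makes it easy to save and restore the flag around a measurement pass. A minimal sketch:

    #include "ggml.h"

    // build graph metadata without allocating tensor data, then restore the old mode
    static void dry_run(struct ggml_context * ctx) {
        const bool was_no_alloc = ggml_get_no_alloc(ctx);
        ggml_set_no_alloc(ctx, true);
        // ... create tensors / build the graph here to estimate memory use ...
        ggml_set_no_alloc(ctx, was_no_alloc);
    }
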
@@ -613,9 +641,11 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
- GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
- GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
+ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);

  //
  // operations on tensors with backpropagation
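
Because the per-activation ops were folded into GGML_OP_UNARY, code that inspects a node's op field now uses ggml_get_unary_op to recover which activation it is. An illustrative check (the helper name is ours):

    #include <stdbool.h>
    #include "ggml.h"

    // true if the node is a SiLU activation under the new encoding
    static bool is_silu(const struct ggml_tensor * t) {
        return t->op == GGML_OP_UNARY && ggml_get_unary_op(t) == GGML_UNARY_OP_SILU;
    }
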
@@ -625,6 +655,11 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_dup_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_add(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -849,14 +884,17 @@ extern "C" {

  GGML_API struct ggml_tensor * ggml_rms_norm(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a,
+ float eps);

  GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
  struct ggml_context * ctx,
- struct ggml_tensor * a);
+ struct ggml_tensor * a,
+ float eps);

  // a - x
  // b - dy
+ // TODO: update with configurable eps
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
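
ggml_rms_norm and ggml_rms_norm_inplace now take the epsilon explicitly instead of relying on a built-in constant, so callers must pass one. A minimal sketch; the 1e-5f value is only illustrative, each model defines its own epsilon:

    #include "ggml.h"

    static struct ggml_tensor * rms(struct ggml_context * ctx, struct ggml_tensor * inp) {
        return ggml_rms_norm(ctx, inp, 1e-5f);   // epsilon chosen for the example, not a library default
    }
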
@@ -948,11 +986,22 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ // a -> b, in-place, return view(b)
+ GGML_API struct ggml_tensor * ggml_cpy_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // make contiguous
  GGML_API struct ggml_tensor * ggml_cont(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // make contiguous, in-place
+ GGML_API struct ggml_tensor * ggml_cont_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return view(a), b specifies the new shape
  // TODO: when we start computing gradient, make a copy instead of view
  GGML_API struct ggml_tensor * ggml_reshape(
@@ -1121,7 +1170,18 @@ extern "C" {
  int mode,
  int n_ctx);

- // custom RoPE, in-place, returns view(a)
+ // custom RoPE
+ GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale);
+
+ // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
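
ggml_rope_custom adds a non-in-place variant with explicit freq_base and freq_scale, the knobs used for RoPE context scaling. A hedged example (the wrapper and the values are ours; 10000.0f with freq_scale 1.0f corresponds to the conventional RoPE defaults, and a freq_scale below 1.0f corresponds to linear scaling):

    #include "ggml.h"

    static struct ggml_tensor * rope_default(struct ggml_context * ctx, struct ggml_tensor * q,
                                             int n_past, int n_rot, int n_ctx) {
        // mode 0 is the standard rotary mode; frequency values shown are illustrative defaults
        return ggml_rope_custom(ctx, q, n_past, n_rot, 0, n_ctx, 10000.0f, 1.0f);
    }
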
@@ -1264,6 +1324,16 @@ extern "C" {
  typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
  typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);

+ GGML_API struct ggml_tensor * ggml_unary(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
+ GGML_API struct ggml_tensor * ggml_unary_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
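
ggml_unary and ggml_unary_inplace are the generic entry points for the consolidated activations, taking the ggml_unary_op selector introduced above. For example, a ReLU node can be built as:

    #include "ggml.h"

    static struct ggml_tensor * relu(struct ggml_context * ctx, struct ggml_tensor * a) {
        // ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU) would instead return a view of 'a'
        return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
    }
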
@@ -1343,11 +1413,17 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * tensor);

+
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

+ // graph allocation in a context
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
+ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_graph_overhead(void);
+
  // ggml_graph_plan() has to be called before ggml_graph_compute()
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
  GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
  GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);