llama_cpp 0.9.1 → 0.9.3: changes to the vendored ggml.h header

@@ -58,7 +58,8 @@
  // {
  // ...
  //
- // struct ggml_cgraph gf = ggml_build_forward(f);
+ // struct ggml_cgraph * gf = ggml_new_graph(ctx);
+ // ggml_build_forward_expand(gf, f);
  //
  // // set the input variable and parameter values
  // ggml_set_f32(x, 2.0f);
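The old API returned a struct ggml_cgraph by value; graphs are now heap-allocated inside the context and built incrementally. A minimal runnable sketch of the updated idiom, based on the header's own f(x) = a*x + b example:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x), b);

        // 0.9.3: the graph lives in context memory, replacing the old
        // stack-allocated result of ggml_build_forward(f)
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, f);

        ggml_set_f32(x, 2.0f);
        ggml_set_f32(a, 3.0f);
        ggml_set_f32(b, 4.0f);

        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);
        printf("f = %f\n", ggml_get_f32_1d(f, 0)); // 10.0

        ggml_free(ctx);
        return 0;
    }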
@@ -213,15 +214,14 @@
  #define GGML_QNT_VERSION 2 // bump this on quantization format changes
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

- #define GGML_MAX_DIMS 4
- #define GGML_MAX_NODES 16384
- #define GGML_MAX_PARAMS 1024
- #define GGML_MAX_CONTEXTS 64
- #define GGML_MAX_SRC 6
- #define GGML_MAX_NAME 64
- #define GGML_MAX_OP_PARAMS 64
- #define GGML_DEFAULT_N_THREADS 4
-
+ #define GGML_MAX_DIMS 4
+ #define GGML_MAX_PARAMS 1024
+ #define GGML_MAX_CONTEXTS 64
+ #define GGML_MAX_SRC 6
+ #define GGML_MAX_NAME 64
+ #define GGML_MAX_OP_PARAMS 64
+ #define GGML_DEFAULT_N_THREADS 4
+ #define GGML_DEFAULT_GRAPH_SIZE 2048
  #if UINTPTR_MAX == 0xFFFFFFFF
  #define GGML_MEM_ALIGN 4
  #else
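GGML_MAX_NODES is gone: a graph's node capacity is now chosen when the graph is created, with GGML_DEFAULT_GRAPH_SIZE (2048 nodes) as the default, and its storage comes out of the ggml_context. A sketch of sizing a context for one default-size graph, using the overhead helpers that appear later in this diff:

    // room for the graph bookkeeping plus one tensor struct per node;
    // tensor *data* is assumed to be allocated elsewhere (no_alloc = true)
    size_t mem = ggml_graph_overhead() + ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE;
    struct ggml_init_params params = {
        /*.mem_size   =*/ mem,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_cgraph  * gf  = ggml_new_graph(ctx); // capacity: GGML_DEFAULT_GRAPH_SIZE nodes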
@@ -245,7 +245,10 @@
  do { \
  if (!(x)) { \
  fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- abort(); \
+ fflush(stderr); \
+ fflush(stdout); \
+ ggml_print_backtrace(); \
+ exit(1); \
  } \
  } while (0)

@@ -400,13 +403,8 @@ extern "C" {
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CLAMP,
- GGML_OP_CONV_1D,
- GGML_OP_CONV_1D_STAGE_0, // internal
- GGML_OP_CONV_1D_STAGE_1, // internal
  GGML_OP_CONV_TRANSPOSE_1D,
- GGML_OP_CONV_2D,
- GGML_OP_CONV_2D_STAGE_0, // internal
- GGML_OP_CONV_2D_STAGE_1, // internal
+ GGML_OP_IM2COL,
  GGML_OP_CONV_TRANSPOSE_2D,
  GGML_OP_POOL_1D,
  GGML_OP_POOL_2D,
@@ -451,6 +449,7 @@ extern "C" {
  GGML_UNARY_OP_GELU,
  GGML_UNARY_OP_GELU_QUICK,
  GGML_UNARY_OP_SILU,
+ GGML_UNARY_OP_LEAKY
  };

  enum ggml_object_type {
@@ -531,37 +530,33 @@ extern "C" {

  int n_threads;

- // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
- int n_tasks[GGML_MAX_NODES];
-
  // abort ggml_graph_compute when true
  bool (*abort_callback)(void * data);
  void * abort_callback_data;
  };

- // next prime after GGML_MAX_NODES
- // #define GGML_GRAPH_HASHTABLE_SIZE 4099
- // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
- // #define GGML_GRAPH_HASHTABLE_SIZE 8273
- // #define GGML_GRAPH_HASHTABLE_SIZE 16411
- #define GGML_GRAPH_HASHTABLE_SIZE 32771
-
  enum ggml_cgraph_eval_order {
  GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
  GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
  GGML_CGRAPH_EVAL_ORDER_COUNT
  };

+ struct ggml_hash_set {
+ size_t size;
+ struct ggml_tensor ** keys;
+ };
+
  // computation graph
  struct ggml_cgraph {
+ int size;
  int n_nodes;
  int n_leafs;

- struct ggml_tensor * nodes[GGML_MAX_NODES];
- struct ggml_tensor * grads[GGML_MAX_NODES];
- struct ggml_tensor * leafs[GGML_MAX_NODES];
+ struct ggml_tensor ** nodes;
+ struct ggml_tensor ** grads;
+ struct ggml_tensor ** leafs;

- void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+ struct ggml_hash_set visited_hash_table;

  enum ggml_cgraph_eval_order order;

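With the fixed-size arrays and the standalone hash table gone, ggml_cgraph now holds pointers into context memory and records its capacity in the new size field; visited tensors are tracked in the new ggml_hash_set. Code that merely iterates a built graph is unaffected, e.g. a sketch that dumps the nodes of a graph gf:

    // only the storage behind the pointers moved; iteration is unchanged
    for (int i = 0; i < gf->n_nodes; ++i) {
        struct ggml_tensor * node = gf->nodes[i];
        printf("node %3d: %-12s %s\n", i, ggml_op_name(node->op), ggml_get_name(node));
    }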
@@ -571,8 +566,6 @@ extern "C" {
  int64_t perf_time_us;
  };

- static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
-
  // scratch buffer
  struct ggml_scratch {
  size_t offs;
@@ -617,6 +610,8 @@ extern "C" {
  GGML_API int64_t ggml_cycles(void);
  GGML_API int64_t ggml_cycles_per_ms(void);

+ GGML_API void ggml_print_backtrace(void);
+
  GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
  GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

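ggml_print_backtrace() is the helper the reworked GGML_ASSERT calls before exiting, and it is also callable directly from your own fatal-error path. A sketch (model_ok is a hypothetical flag):

    if (!model_ok) {
        fprintf(stderr, "fatal: model failed validation\n");
        ggml_print_backtrace(); // best-effort backtrace, platform permitting
        exit(1);
    }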
@@ -709,7 +704,7 @@ extern "C" {
  // Context tensor enumeration and lookup
  GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
  GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
- GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
@@ -943,6 +938,10 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_leaky(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_relu_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
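A sketch of the new activation in a graph; as the declaration shows, ggml_leaky takes no slope argument in this version (the negative slope is fixed internally):

    struct ggml_tensor * h   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * act = ggml_leaky(ctx, h); // leaky ReLU via GGML_UNARY_OP_LEAKY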
@@ -1372,8 +1371,13 @@ extern "C" {
  int n_dims,
  int mode,
  int n_ctx,
+ int n_orig_ctx,
  float freq_base,
  float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
  float xpos_base,
  bool xpos_down);

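This hunk extends ggml_rope_back (the declaration ending in the xpos parameters) with the YaRN context-extension parameters; the forward rope variants gain the same arguments. A hedged sketch of a forward call, assuming ggml_rope_custom carries the identical parameter list up to the xpos pair:

    // q: tensor to rotate; pos: I32 tensor with one position per row
    struct ggml_tensor * cur = ggml_rope_custom(
        ctx, q, pos,
        /*n_dims     =*/ 128,
        /*mode       =*/ 0,
        /*n_ctx      =*/ 0,
        /*n_orig_ctx =*/ 4096,     // context length the model was trained at
        /*freq_base  =*/ 10000.0f,
        /*freq_scale =*/ 1.0f,
        /*ext_factor =*/ 1.0f,     // YaRN: interpolation/extrapolation mix
        /*attn_factor=*/ 1.0f,     // YaRN: attention magnitude correction
        /*beta_fast  =*/ 32.0f,    // YaRN: correction-range low bound
        /*beta_slow  =*/ 1.0f);    // YaRN: correction-range high bound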
@@ -1394,6 +1398,18 @@ extern "C" {
  float min,
  float max);

+ GGML_API struct ggml_tensor * ggml_im2col(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1,
+ bool is_2D);
+
  GGML_API struct ggml_tensor * ggml_conv_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
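ggml_im2col is the primitive that replaced the staged conv ops removed above: it unfolds input windows into rows so the convolution itself reduces to a matrix multiply. A hedged sketch of a direct 2-D call, assuming the argument roles follow the ggml_conv convention (kernel first); in practice ggml_conv_1d/ggml_conv_2d wrap this:

    // a: convolution kernel, b: input data
    struct ggml_tensor * patches = ggml_im2col(ctx, a, b,
        /*s0 =*/ 1, /*s1 =*/ 1,   // stride per spatial dimension
        /*p0 =*/ 1, /*p1 =*/ 1,   // padding
        /*d0 =*/ 1, /*d1 =*/ 1,   // dilation
        /*is_2D =*/ true);
    // ... followed by a ggml_mul_mat against the flattened kernel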
@@ -1477,6 +1493,8 @@ extern "C" {
  int s0, // stride
  int p0); // padding

+ // the result will have 2*p0 padding for the first dimension
+ // and 2*p1 padding for the second dimension
  GGML_API struct ggml_tensor * ggml_pool_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1485,8 +1503,8 @@ extern "C" {
  int k1,
  int s0,
  int s1,
- int p0,
- int p1);
+ float p0,
+ float p1);

  // nearest interpolate
  // used in stable-diffusion
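The padding parameters of ggml_pool_2d changed from int to float, permitting fractional padding; per the comment added above, the result sees 2*p0 and 2*p1 of it. A sketch of a 2x2 max pool with stride 2 on an input tensor img:

    struct ggml_tensor * pooled = ggml_pool_2d(ctx, img, GGML_OP_POOL_MAX,
        /*k0 =*/ 2, /*k1 =*/ 2,        // kernel
        /*s0 =*/ 2, /*s1 =*/ 2,        // stride
        /*p0 =*/ 0.0f, /*p1 =*/ 0.0f); // padding, now float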
@@ -1727,19 +1745,22 @@ extern "C" {
  GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
  GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

- GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
- GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-
  // graph allocation in a context
- GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
- GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+ GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
+ GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+ GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+ GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
+ GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
+
  GGML_API size_t ggml_graph_overhead(void);
+ GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

  // ggml_graph_plan() has to be called before ggml_graph_compute()
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
  GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
- GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+ GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

  // same as ggml_graph_compute() but the work data is allocated as a part of the context
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
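The by-value builders ggml_build_forward/ggml_build_backward are gone; everything now goes through context-allocated graphs. A sketch of the training-style flow with the new calls, assuming loss is the tensor to differentiate:

    // forward graph with gradient slots reserved
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
    ggml_build_forward_expand(gf, loss);

    // the backward graph starts as a copy of the forward graph
    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
    ggml_build_backward_expand(ctx, gf, gb, /*keep =*/ true);

    ggml_graph_reset(gf); // zero the gradients before accumulating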
@@ -1747,8 +1768,8 @@ extern "C" {

  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

- GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
- GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+ GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);

  // print info and performance information for the graph
  GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
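ggml_graph_import now hands back a pointer instead of a by-value struct. A sketch (the file name is illustrative):

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;
    struct ggml_cgraph  * gf = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);
    if (gf) {
        ggml_graph_print(gf);
    }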
@@ -1811,6 +1832,8 @@ extern "C" {
  struct ggml_opt_params {
  enum ggml_opt_type type;

+ size_t graph_size;
+
  int n_threads;

  // delta-based convergence test
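Since graphs are now dynamically sized, the optimizer needs to know how large to make the ones it builds internally; that is what the new graph_size field carries. A hedged sketch of overriding it, assuming ggml_opt_default_params seeds the field with a default:

    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt_params.graph_size = 4096; // node budget for the optimizer's internal graphs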