llama_cpp 0.9.1 → 0.9.3

@@ -58,7 +58,8 @@
 // {
 //    ...
 //
-//    struct ggml_cgraph gf = ggml_build_forward(f);
+//    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+//    ggml_build_forward_expand(gf, f);
 //
 //    // set the input variable and parameter values
 //    ggml_set_f32(x, 2.0f);
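
Note: this comment change tracks the breaking change further down in this diff: a cgraph is no longer returned by value but allocated inside a ggml_context. A minimal migration sketch (x, f, ctx as in the surrounding ggml.h example; the thread count is arbitrary here):

    // 0.9.1: the graph was a stack value
    //   struct ggml_cgraph gf = ggml_build_forward(f);
    // 0.9.3: allocate the graph in the context, then expand it from the output tensor
    struct ggml_cgraph * gf = ggml_new_graph(ctx); // capacity GGML_DEFAULT_GRAPH_SIZE, no grads
    ggml_build_forward_expand(gf, f);              // adds f and all of its dependencies
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);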
@@ -213,15 +214,14 @@
 #define GGML_QNT_VERSION 2 // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
-#define GGML_MAX_DIMS 4
-#define GGML_MAX_NODES 16384
-#define GGML_MAX_PARAMS 1024
-#define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_SRC 6
-#define GGML_MAX_NAME 64
-#define GGML_MAX_OP_PARAMS 64
-#define GGML_DEFAULT_N_THREADS 4
-
+#define GGML_MAX_DIMS 4
+#define GGML_MAX_PARAMS 1024
+#define GGML_MAX_CONTEXTS 64
+#define GGML_MAX_SRC 6
+#define GGML_MAX_NAME 64
+#define GGML_MAX_OP_PARAMS 64
+#define GGML_DEFAULT_N_THREADS 4
+#define GGML_DEFAULT_GRAPH_SIZE 2048
 #if UINTPTR_MAX == 0xFFFFFFFF
 #define GGML_MEM_ALIGN 4
 #else
@@ -245,7 +245,10 @@
     do { \
         if (!(x)) { \
             fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
+            fflush(stderr); \
+            fflush(stdout); \
+            ggml_print_backtrace(); \
+            exit(1); \
         } \
     } while (0)
 
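A failed assertion now flushes both streams and prints a backtrace before exiting, so the message can no longer be lost in unflushed buffers; note the failure mode also changes from SIGABRT (abort) to exit status 1. Usage is unchanged, e.g.:

    // shape check before an elementwise op; on failure this now prints
    // "GGML_ASSERT: <file>:<line>: ggml_are_same_shape(a, b)" plus a
    // backtrace, then exits with status 1 instead of raising SIGABRT
    GGML_ASSERT(ggml_are_same_shape(a, b));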
@@ -400,13 +403,8 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D,
-        GGML_OP_CONV_1D_STAGE_0, // internal
-        GGML_OP_CONV_1D_STAGE_1, // internal
         GGML_OP_CONV_TRANSPOSE_1D,
-        GGML_OP_CONV_2D,
-        GGML_OP_CONV_2D_STAGE_0, // internal
-        GGML_OP_CONV_2D_STAGE_1, // internal
+        GGML_OP_IM2COL,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -451,6 +449,7 @@ extern "C" {
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_LEAKY
     };
 
     enum ggml_object_type {
@@ -531,37 +530,33 @@ extern "C" {
 
         int n_threads;
 
-        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
-        int n_tasks[GGML_MAX_NODES];
-
         // abort ggml_graph_compute when true
         bool (*abort_callback)(void * data);
         void * abort_callback_data;
     };
 
-    // next prime after GGML_MAX_NODES
-    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
-    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-    // #define GGML_GRAPH_HASHTABLE_SIZE 8273
-    // #define GGML_GRAPH_HASHTABLE_SIZE 16411
-    #define GGML_GRAPH_HASHTABLE_SIZE 32771
-
     enum ggml_cgraph_eval_order {
         GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
         GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
         GGML_CGRAPH_EVAL_ORDER_COUNT
     };
 
+    struct ggml_hash_set {
+        size_t size;
+        struct ggml_tensor ** keys;
+    };
+
     // computation graph
     struct ggml_cgraph {
+        int size;
         int n_nodes;
         int n_leafs;
 
-        struct ggml_tensor * nodes[GGML_MAX_NODES];
-        struct ggml_tensor * grads[GGML_MAX_NODES];
-        struct ggml_tensor * leafs[GGML_MAX_NODES];
+        struct ggml_tensor ** nodes;
+        struct ggml_tensor ** grads;
+        struct ggml_tensor ** leafs;
 
-        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+        struct ggml_hash_set visited_hash_table;
 
         enum ggml_cgraph_eval_order order;
 
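The fixed-capacity arrays sized by GGML_MAX_NODES are gone: nodes, grads, leafs and the visited hash set are now per-graph allocations, sized when the graph is created (see ggml_new_graph_custom below). Iteration code is unaffected apart from the pointer type, e.g.:

    // nodes is now a heap-allocated array of gf->size slots,
    // of which the first gf->n_nodes are valid
    for (int i = 0; i < gf->n_nodes; i++) {
        fprintf(stderr, "node %3d: op=%s\n", i, ggml_op_name(gf->nodes[i]->op));
    }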
@@ -571,8 +566,6 @@ extern "C" {
         int64_t perf_time_us;
     };
 
-    static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
-
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
@@ -617,6 +610,8 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);
 
+    GGML_API void    ggml_print_backtrace(void);
+
     GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
     GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
@@ -709,7 +704,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor      (struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
@@ -943,6 +938,10 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_leaky(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_relu_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
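
ggml_leaky is the tensor-level wrapper for the new GGML_UNARY_OP_LEAKY above. A hypothetical use as a leaky-ReLU activation (w and inp are illustrative tensors; note the signature exposes no slope parameter, the negative slope is fixed inside the op in this release):

    struct ggml_tensor * cur = ggml_conv_2d(ctx, w, inp, 1, 1, 1, 1, 1, 1);
    cur = ggml_leaky(ctx, cur); // x < 0 ? slope*x : x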
@@ -1372,8 +1371,13 @@ extern "C" {
             int n_dims,
             int mode,
             int n_ctx,
+            int n_orig_ctx,
             float freq_base,
             float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow,
             float xpos_base,
             bool xpos_down);
 
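The five new parameters extend RoPE with YaRN-style context scaling: n_orig_ctx is the model's original training context, ext_factor and attn_factor control extrapolation mixing and attention magnitude scaling, and beta_fast/beta_slow set the correction-dimension ramp. A sketch, assuming ggml_rope_custom gained the same parameters in this release (q and pos are illustrative tensors; ext_factor = 0.0f recovers the unscaled behavior):

    struct ggml_tensor * q_rot = ggml_rope_custom(
            ctx, q, pos, /*n_dims=*/128, /*mode=*/0,
            /*n_ctx=*/4096, /*n_orig_ctx=*/4096,
            /*freq_base=*/10000.0f, /*freq_scale=*/1.0f,
            /*ext_factor=*/0.0f, /*attn_factor=*/1.0f,
            /*beta_fast=*/32.0f, /*beta_slow=*/1.0f);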
@@ -1394,6 +1398,18 @@ extern "C" {
             float min,
             float max);
 
+    GGML_API struct ggml_tensor * ggml_im2col(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            int s0,
+            int s1,
+            int p0,
+            int p1,
+            int d0,
+            int d1,
+            bool is_2D);
+
     GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
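
ggml_im2col replaces the internal CONV_*_STAGE_* ops removed above: convolutions are now expressed as an explicit im2col unfolding followed by a matrix multiplication. Roughly the first stage of what ggml_conv_2d now does internally (a is the kernel, b the input; s/p/d are stride, padding and dilation per axis; the 1D case passes is_2D=false):

    // unfold input patches into columns, then reduce them with a matmul
    struct ggml_tensor * col = ggml_im2col(ctx, a, b,
            /*s0=*/1, /*s1=*/1, /*p0=*/1, /*p1=*/1, /*d0=*/1, /*d1=*/1,
            /*is_2D=*/true);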
@@ -1477,6 +1493,8 @@ extern "C" {
             int s0, // stride
             int p0); // padding
 
+    // the result will have 2*p0 padding for the first dimension
+    // and 2*p1 padding for the second dimension
     GGML_API struct ggml_tensor * ggml_pool_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1485,8 +1503,8 @@ extern "C" {
             int k1,
             int s0,
             int s1,
-            int p0,
-            int p1);
+            float p0,
+            float p1);
 
     // nearest interpolate
     // used in stable-diffusion
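
The padding arguments p0/p1 change from int to float, so existing call sites still compile (integer literals convert implicitly) but now allow fractional padding. A sketch of a plain 2x2 max pool under the new signature (cur is an illustrative tensor):

    struct ggml_tensor * pooled = ggml_pool_2d(ctx, cur, GGML_OP_POOL_MAX,
            /*k0=*/2, /*k1=*/2, /*s0=*/2, /*s1=*/2,
            /*p0=*/0.0f, /*p1=*/0.0f);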
@@ -1727,19 +1745,22 @@ extern "C" {
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
 
-    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
-    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-
     // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx);
-    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_graph_view       (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // zero grads
+    GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
+
     GGML_API size_t ggml_graph_overhead(void);
+    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-    GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);
+    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
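
The by-value builders ggml_build_forward/ggml_build_backward and ggml_build_forward_ctx are removed; every graph now lives in a context and is sized up front (ggml_graph_reset is only relocated here, next to the other graph functions). To budget a context for a larger-than-default graph, a sketch:

    // reserve just enough context memory for the graph metadata itself
    size_t n = 8192; // illustrative node capacity
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_graph_overhead_custom(n, /*grads=*/false),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_cgraph  * gf  = ggml_new_graph_custom(ctx, n, /*grads=*/false);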
@@ -1747,8 +1768,8 @@ extern "C" {
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
-    GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
 
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
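
ggml_graph_import now returns a pointer to a graph allocated inside ctx_eval, matching the rest of the new graph API, so call sites change from `struct ggml_cgraph gf = ...` to (the file name is illustrative):

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;
    struct ggml_cgraph  * gf = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);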
@@ -1811,6 +1832,8 @@ extern "C" {
     struct ggml_opt_params {
         enum ggml_opt_type type;
 
+        size_t graph_size;
+
         int n_threads;
 
         // delta-based convergence test
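
The new graph_size field lets the optimizer size the forward/backward graphs it builds internally, now that they are no longer capped by GGML_MAX_NODES. A sketch, assuming ggml_opt_default_params fills in a sane default:

    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt_params.graph_size = 4096; // raise if the training graph exceeds the default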