cui-llama.rn 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml.h CHANGED
@@ -220,7 +220,7 @@
220
220
  #include <stdio.h>
221
221
 
222
222
  #define LM_GGML_FILE_MAGIC 0x67676d6c // "ggml"
223
- #define LM_GGML_FILE_VERSION 1
223
+ #define LM_GGML_FILE_VERSION 2
224
224
 
225
225
  #define LM_GGML_QNT_VERSION 2 // bump this on quantization format changes
226
226
  #define LM_GGML_QNT_VERSION_FACTOR 1000 // do not change this
@@ -231,6 +231,8 @@
231
231
  #define LM_GGML_MAX_SRC 10
232
232
  #ifndef LM_GGML_MAX_NAME
233
233
  #define LM_GGML_MAX_NAME 64
234
+ #define LM_GGML_MAX_N_THREADS 512
235
+
234
236
  #endif
235
237
  #define LM_GGML_MAX_OP_PARAMS 64
236
238
  #define LM_GGML_DEFAULT_N_THREADS 4
@@ -393,6 +395,8 @@ extern "C" {
393
395
  LM_GGML_TYPE_Q4_0_4_4 = 31,
394
396
  LM_GGML_TYPE_Q4_0_4_8 = 32,
395
397
  LM_GGML_TYPE_Q4_0_8_8 = 33,
398
+ LM_GGML_TYPE_TQ1_0 = 34,
399
+ LM_GGML_TYPE_TQ2_0 = 35,
396
400
  LM_GGML_TYPE_COUNT,
397
401
  };
398
402
 
@@ -453,6 +457,8 @@ extern "C" {
453
457
  LM_GGML_OP_SQR,
454
458
  LM_GGML_OP_SQRT,
455
459
  LM_GGML_OP_LOG,
460
+ LM_GGML_OP_SIN,
461
+ LM_GGML_OP_COS,
456
462
  LM_GGML_OP_SUM,
457
463
  LM_GGML_OP_SUM_ROWS,
458
464
  LM_GGML_OP_MEAN,
@@ -490,9 +496,11 @@ extern "C" {
490
496
  LM_GGML_OP_CLAMP,
491
497
  LM_GGML_OP_CONV_TRANSPOSE_1D,
492
498
  LM_GGML_OP_IM2COL,
499
+ LM_GGML_OP_IM2COL_BACK,
493
500
  LM_GGML_OP_CONV_TRANSPOSE_2D,
494
501
  LM_GGML_OP_POOL_1D,
495
502
  LM_GGML_OP_POOL_2D,
503
+ LM_GGML_OP_POOL_2D_BACK,
496
504
  LM_GGML_OP_UPSCALE, // nearest interpolate
497
505
  LM_GGML_OP_PAD,
498
506
  LM_GGML_OP_ARANGE,
@@ -508,6 +516,7 @@ extern "C" {
508
516
  LM_GGML_OP_WIN_UNPART,
509
517
  LM_GGML_OP_GET_REL_POS,
510
518
  LM_GGML_OP_ADD_REL_POS,
519
+ LM_GGML_OP_RWKV_WKV,
511
520
 
512
521
  LM_GGML_OP_UNARY,
513
522
 
@@ -542,6 +551,7 @@ extern "C" {
542
551
  LM_GGML_UNARY_OP_SILU,
543
552
  LM_GGML_UNARY_OP_HARDSWISH,
544
553
  LM_GGML_UNARY_OP_HARDSIGMOID,
554
+ LM_GGML_UNARY_OP_EXP,
545
555
 
546
556
  LM_GGML_UNARY_OP_COUNT,
547
557
  };
@@ -624,6 +634,29 @@ extern "C" {
624
634
  // If it returns true, the computation is aborted
625
635
  typedef bool (*lm_ggml_abort_callback)(void * data);
626
636
 
637
+ // Scheduling priorities
638
+ enum lm_ggml_sched_priority {
639
+ LM_GGML_SCHED_PRIO_NORMAL,
640
+ LM_GGML_SCHED_PRIO_MEDIUM,
641
+ LM_GGML_SCHED_PRIO_HIGH,
642
+ LM_GGML_SCHED_PRIO_REALTIME
643
+ };
644
+
645
+ // Threadpool params
646
+ // Use lm_ggml_threadpool_params_default() or lm_ggml_threadpool_params_init() to populate the defaults
647
+ struct lm_ggml_threadpool_params {
648
+ bool cpumask[LM_GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
649
+ int n_threads; // number of threads
650
+ enum lm_ggml_sched_priority prio; // thread priority
651
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
652
+ bool strict_cpu; // strict cpu placement
653
+ bool paused; // start in paused state
654
+ };
655
+
656
+ struct lm_ggml_threadpool; // forward declaration, see ggml.c
657
+
658
+ typedef struct lm_ggml_threadpool * lm_ggml_threadpool_t;
659
+
627
660
  // the compute plan that needs to be prepared for lm_ggml_graph_compute()
628
661
  // since https://github.com/ggerganov/ggml/issues/287
629
662
  struct lm_ggml_cplan {
@@ -631,6 +664,7 @@ extern "C" {
631
664
  uint8_t * work_data; // work buffer, to be allocated by caller before calling to `lm_ggml_graph_compute()`
632
665
 
633
666
  int n_threads;
667
+ struct lm_ggml_threadpool * threadpool;
634
668
 
635
669
  // abort lm_ggml_graph_compute when true
636
670
  lm_ggml_abort_callback abort_callback;
@@ -647,8 +681,8 @@ extern "C" {
647
681
 
648
682
  struct lm_ggml_hash_set {
649
683
  size_t size;
650
- lm_ggml_bitset_t * used;
651
- struct lm_ggml_tensor ** keys;
684
+ lm_ggml_bitset_t * used; // whether or not the keys are in use i.e. set
685
+ struct lm_ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if lm_ggml_bitset_get(used, i)
652
686
  };
653
687
 
654
688
  // computation graph
@@ -969,6 +1003,22 @@ extern "C" {
969
1003
  struct lm_ggml_context * ctx,
970
1004
  struct lm_ggml_tensor * a);
971
1005
 
1006
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_sin(
1007
+ struct lm_ggml_context * ctx,
1008
+ struct lm_ggml_tensor * a);
1009
+
1010
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_sin_inplace(
1011
+ struct lm_ggml_context * ctx,
1012
+ struct lm_ggml_tensor * a);
1013
+
1014
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_cos(
1015
+ struct lm_ggml_context * ctx,
1016
+ struct lm_ggml_tensor * a);
1017
+
1018
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_cos_inplace(
1019
+ struct lm_ggml_context * ctx,
1020
+ struct lm_ggml_tensor * a);
1021
+
972
1022
  // return scalar
973
1023
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_sum(
974
1024
  struct lm_ggml_context * ctx,
@@ -1119,6 +1169,14 @@ extern "C" {
1119
1169
  struct lm_ggml_context * ctx,
1120
1170
  struct lm_ggml_tensor * a);
1121
1171
 
1172
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_exp(
1173
+ struct lm_ggml_context * ctx,
1174
+ struct lm_ggml_tensor * a);
1175
+
1176
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_exp_inplace(
1177
+ struct lm_ggml_context * ctx,
1178
+ struct lm_ggml_tensor * a);
1179
+
1122
1180
  // normalize along rows
1123
1181
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_norm(
1124
1182
  struct lm_ggml_context * ctx,
@@ -1214,7 +1272,7 @@ extern "C" {
1214
1272
  size_t nb1,
1215
1273
  size_t nb2,
1216
1274
  size_t nb3,
1217
- size_t offset);
1275
+ size_t offset); // in bytes
1218
1276
 
1219
1277
  // b -> view(a,offset,nb1,nb2,3), return view(a)
1220
1278
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_inplace(
@@ -1224,19 +1282,19 @@ extern "C" {
1224
1282
  size_t nb1,
1225
1283
  size_t nb2,
1226
1284
  size_t nb3,
1227
- size_t offset);
1285
+ size_t offset); // in bytes
1228
1286
 
1229
1287
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_1d(
1230
1288
  struct lm_ggml_context * ctx,
1231
1289
  struct lm_ggml_tensor * a,
1232
1290
  struct lm_ggml_tensor * b,
1233
- size_t offset);
1291
+ size_t offset); // in bytes
1234
1292
 
1235
1293
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_1d_inplace(
1236
1294
  struct lm_ggml_context * ctx,
1237
1295
  struct lm_ggml_tensor * a,
1238
1296
  struct lm_ggml_tensor * b,
1239
- size_t offset);
1297
+ size_t offset); // in bytes
1240
1298
 
1241
1299
  // b -> view(a,offset,nb1,nb2,3), return modified a
1242
1300
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_2d(
@@ -1244,7 +1302,7 @@ extern "C" {
1244
1302
  struct lm_ggml_tensor * a,
1245
1303
  struct lm_ggml_tensor * b,
1246
1304
  size_t nb1,
1247
- size_t offset);
1305
+ size_t offset); // in bytes
1248
1306
 
1249
1307
  // b -> view(a,offset,nb1,nb2,3), return view(a)
1250
1308
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_2d_inplace(
@@ -1252,7 +1310,7 @@ extern "C" {
1252
1310
  struct lm_ggml_tensor * a,
1253
1311
  struct lm_ggml_tensor * b,
1254
1312
  size_t nb1,
1255
- size_t offset);
1313
+ size_t offset); // in bytes
1256
1314
 
1257
1315
  // a -> b, return view(b)
1258
1316
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_cpy(
@@ -1566,34 +1624,49 @@ extern "C" {
1566
1624
  float min,
1567
1625
  float max);
1568
1626
 
1627
+ // im2col
1628
+ // converts data into a format that effectively results in a convolution when combined with matrix multiplication
1569
1629
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_im2col(
1570
1630
  struct lm_ggml_context * ctx,
1571
- struct lm_ggml_tensor * a,
1572
- struct lm_ggml_tensor * b,
1573
- int s0,
1574
- int s1,
1575
- int p0,
1576
- int p1,
1577
- int d0,
1578
- int d1,
1579
- bool is_2D,
1580
- enum lm_ggml_type dst_type);
1631
+ struct lm_ggml_tensor * a, // convolution kernel
1632
+ struct lm_ggml_tensor * b, // data
1633
+ int s0, // stride dimension 0
1634
+ int s1, // stride dimension 1
1635
+ int p0, // padding dimension 0
1636
+ int p1, // padding dimension 1
1637
+ int d0, // dilation dimension 0
1638
+ int d1, // dilation dimension 1
1639
+ bool is_2D,
1640
+ enum lm_ggml_type dst_type);
1641
+
1642
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_im2col_back(
1643
+ struct lm_ggml_context * ctx,
1644
+ struct lm_ggml_tensor * a, // convolution kernel
1645
+ struct lm_ggml_tensor * b, // gradient of im2col output
1646
+ int64_t * ne, // shape of im2col input
1647
+ int s0, // stride dimension 0
1648
+ int s1, // stride dimension 1
1649
+ int p0, // padding dimension 0
1650
+ int p1, // padding dimension 1
1651
+ int d0, // dilation dimension 0
1652
+ int d1, // dilation dimension 1
1653
+ bool is_2D);
1581
1654
 
1582
1655
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_depthwise_2d(
1583
1656
  struct lm_ggml_context * ctx,
1584
- struct lm_ggml_tensor * a,
1585
- struct lm_ggml_tensor * b,
1586
- int s0,
1587
- int s1,
1588
- int p0,
1589
- int p1,
1590
- int d0,
1591
- int d1);
1657
+ struct lm_ggml_tensor * a, // convolution kernel
1658
+ struct lm_ggml_tensor * b, // data
1659
+ int s0, // stride dimension 0
1660
+ int s1, // stride dimension 1
1661
+ int p0, // padding dimension 0
1662
+ int p1, // padding dimension 1
1663
+ int d0, // dilation dimension 0
1664
+ int d1); // dilation dimension 1
1592
1665
 
1593
1666
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d(
1594
1667
  struct lm_ggml_context * ctx,
1595
- struct lm_ggml_tensor * a,
1596
- struct lm_ggml_tensor * b,
1668
+ struct lm_ggml_tensor * a, // convolution kernel
1669
+ struct lm_ggml_tensor * b, // data
1597
1670
  int s0, // stride
1598
1671
  int p0, // padding
1599
1672
  int d0); // dilation
@@ -1602,29 +1675,29 @@ extern "C" {
1602
1675
  // alias for lm_ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1603
1676
  LM_GGML_API struct lm_ggml_tensor* lm_ggml_conv_1d_ph(
1604
1677
  struct lm_ggml_context * ctx,
1605
- struct lm_ggml_tensor * a,
1606
- struct lm_ggml_tensor * b,
1607
- int s,
1608
- int d);
1678
+ struct lm_ggml_tensor * a, // convolution kernel
1679
+ struct lm_ggml_tensor * b, // data
1680
+ int s, // stride
1681
+ int d); // dilation
1609
1682
 
1610
1683
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d(
1611
1684
  struct lm_ggml_context * ctx,
1612
- struct lm_ggml_tensor * a,
1613
- struct lm_ggml_tensor * b,
1614
- int s0,
1615
- int p0,
1616
- int d0);
1685
+ struct lm_ggml_tensor * a, // convolution kernel
1686
+ struct lm_ggml_tensor * b, // data
1687
+ int s0, // stride
1688
+ int p0, // padding
1689
+ int d0); // dilation
1617
1690
 
1618
1691
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d(
1619
1692
  struct lm_ggml_context * ctx,
1620
- struct lm_ggml_tensor * a,
1621
- struct lm_ggml_tensor * b,
1622
- int s0,
1623
- int s1,
1624
- int p0,
1625
- int p1,
1626
- int d0,
1627
- int d1);
1693
+ struct lm_ggml_tensor * a, // convolution kernel
1694
+ struct lm_ggml_tensor * b, // data
1695
+ int s0, // stride dimension 0
1696
+ int s1, // stride dimension 1
1697
+ int p0, // padding dimension 0
1698
+ int p1, // padding dimension 1
1699
+ int d0, // dilation dimension 0
1700
+ int d1); // dilation dimension 1
1628
1701
 
1629
1702
 
1630
1703
  // kernel size is a->ne[0] x a->ne[1]
@@ -1686,6 +1759,18 @@ extern "C" {
1686
1759
  float p0,
1687
1760
  float p1);
1688
1761
 
1762
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_pool_2d_back(
1763
+ struct lm_ggml_context * ctx,
1764
+ struct lm_ggml_tensor * a,
1765
+ struct lm_ggml_tensor * af, // "a"/input used in forward pass
1766
+ enum lm_ggml_op_pool op,
1767
+ int k0,
1768
+ int k1,
1769
+ int s0,
1770
+ int s1,
1771
+ float p0,
1772
+ float p1);
1773
+
1689
1774
  // nearest interpolate
1690
1775
  // multiplies ne0 and ne1 by scale factor
1691
1776
  // used in stable-diffusion
@@ -1840,6 +1925,15 @@ extern "C" {
1840
1925
  struct lm_ggml_tensor * pw,
1841
1926
  struct lm_ggml_tensor * ph);
1842
1927
 
1928
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_rwkv_wkv(
1929
+ struct lm_ggml_context * ctx,
1930
+ struct lm_ggml_tensor * k,
1931
+ struct lm_ggml_tensor * v,
1932
+ struct lm_ggml_tensor * r,
1933
+ struct lm_ggml_tensor * tf,
1934
+ struct lm_ggml_tensor * td,
1935
+ struct lm_ggml_tensor * state);
1936
+
1843
1937
  // custom operators
1844
1938
 
1845
1939
  typedef void (*lm_ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -2010,10 +2104,23 @@ extern "C" {
2010
2104
  LM_GGML_API size_t lm_ggml_graph_overhead(void);
2011
2105
  LM_GGML_API size_t lm_ggml_graph_overhead_custom(size_t size, bool grads);
2012
2106
 
2107
+ LM_GGML_API struct lm_ggml_threadpool_params lm_ggml_threadpool_params_default(int n_threads);
2108
+ LM_GGML_API void lm_ggml_threadpool_params_init (struct lm_ggml_threadpool_params *p, int n_threads);
2109
+ LM_GGML_API bool lm_ggml_threadpool_params_match (const struct lm_ggml_threadpool_params *p0, const struct lm_ggml_threadpool_params *p1);
2110
+ LM_GGML_API struct lm_ggml_threadpool* lm_ggml_threadpool_new (struct lm_ggml_threadpool_params * params);
2111
+ LM_GGML_API void lm_ggml_threadpool_free (struct lm_ggml_threadpool * threadpool);
2112
+ LM_GGML_API int lm_ggml_threadpool_get_n_threads(struct lm_ggml_threadpool * threadpool);
2113
+ LM_GGML_API void lm_ggml_threadpool_pause (struct lm_ggml_threadpool * threadpool);
2114
+ LM_GGML_API void lm_ggml_threadpool_resume (struct lm_ggml_threadpool * threadpool);
2115
+
2013
2116
  // lm_ggml_graph_plan() has to be called before lm_ggml_graph_compute()
2014
2117
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
2015
- LM_GGML_API struct lm_ggml_cplan lm_ggml_graph_plan (const struct lm_ggml_cgraph * cgraph, int n_threads /*= LM_GGML_DEFAULT_N_THREADS*/);
2016
- LM_GGML_API enum lm_ggml_status lm_ggml_graph_compute( struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan);
2118
+ LM_GGML_API struct lm_ggml_cplan lm_ggml_graph_plan(
2119
+ const struct lm_ggml_cgraph * cgraph,
2120
+ int n_threads, /* = LM_GGML_DEFAULT_N_THREADS */
2121
+ struct lm_ggml_threadpool * threadpool /* = NULL */ );
2122
+ LM_GGML_API enum lm_ggml_status lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan);
2123
+
2017
2124
  // same as lm_ggml_graph_compute() but the work data is allocated as a part of the context
2018
2125
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
2019
2126
  LM_GGML_API enum lm_ggml_status lm_ggml_graph_compute_with_ctx(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph, int n_threads);