llama_cpp 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -58,7 +58,8 @@
|
|
58
58
|
// {
|
59
59
|
// ...
|
60
60
|
//
|
61
|
-
// struct ggml_cgraph gf = ggml_build_forward(f);
|
61
|
+
// struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
62
|
+
// ggml_build_forward_expand(gf, f);
|
62
63
|
//
|
63
64
|
// // set the input variable and parameter values
|
64
65
|
// ggml_set_f32(x, 2.0f);
|
@@ -213,15 +214,14 @@
|
|
213
214
|
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
214
215
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
215
216
|
|
216
|
-
#define GGML_MAX_DIMS
|
217
|
-
#define
|
218
|
-
#define
|
219
|
-
#define
|
220
|
-
#define
|
221
|
-
#define
|
222
|
-
#define
|
223
|
-
#define
|
224
|
-
|
217
|
+
#define GGML_MAX_DIMS 4
|
218
|
+
#define GGML_MAX_PARAMS 1024
|
219
|
+
#define GGML_MAX_CONTEXTS 64
|
220
|
+
#define GGML_MAX_SRC 6
|
221
|
+
#define GGML_MAX_NAME 64
|
222
|
+
#define GGML_MAX_OP_PARAMS 64
|
223
|
+
#define GGML_DEFAULT_N_THREADS 4
|
224
|
+
#define GGML_DEFAULT_GRAPH_SIZE 2048
|
225
225
|
#if UINTPTR_MAX == 0xFFFFFFFF
|
226
226
|
#define GGML_MEM_ALIGN 4
|
227
227
|
#else
|
@@ -245,7 +245,10 @@
|
|
245
245
|
do { \
|
246
246
|
if (!(x)) { \
|
247
247
|
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
248
|
-
|
248
|
+
fflush(stderr); \
|
249
|
+
fflush(stdout); \
|
250
|
+
ggml_print_backtrace(); \
|
251
|
+
exit(1); \
|
249
252
|
} \
|
250
253
|
} while (0)
|
251
254
|
|
@@ -400,13 +403,8 @@ extern "C" {
|
|
400
403
|
GGML_OP_ROPE_BACK,
|
401
404
|
GGML_OP_ALIBI,
|
402
405
|
GGML_OP_CLAMP,
|
403
|
-
GGML_OP_CONV_1D,
|
404
|
-
GGML_OP_CONV_1D_STAGE_0, // internal
|
405
|
-
GGML_OP_CONV_1D_STAGE_1, // internal
|
406
406
|
GGML_OP_CONV_TRANSPOSE_1D,
|
407
|
-
|
408
|
-
GGML_OP_CONV_2D_STAGE_0, // internal
|
409
|
-
GGML_OP_CONV_2D_STAGE_1, // internal
|
407
|
+
GGML_OP_IM2COL,
|
410
408
|
GGML_OP_CONV_TRANSPOSE_2D,
|
411
409
|
GGML_OP_POOL_1D,
|
412
410
|
GGML_OP_POOL_2D,
|
@@ -451,6 +449,7 @@ extern "C" {
|
|
451
449
|
GGML_UNARY_OP_GELU,
|
452
450
|
GGML_UNARY_OP_GELU_QUICK,
|
453
451
|
GGML_UNARY_OP_SILU,
|
452
|
+
GGML_UNARY_OP_LEAKY
|
454
453
|
};
|
455
454
|
|
456
455
|
enum ggml_object_type {
|
@@ -531,37 +530,33 @@ extern "C" {
|
|
531
530
|
|
532
531
|
int n_threads;
|
533
532
|
|
534
|
-
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
535
|
-
int n_tasks[GGML_MAX_NODES];
|
536
|
-
|
537
533
|
// abort ggml_graph_compute when true
|
538
534
|
bool (*abort_callback)(void * data);
|
539
535
|
void * abort_callback_data;
|
540
536
|
};
|
541
537
|
|
542
|
-
// next prime after GGML_MAX_NODES
|
543
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
544
|
-
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
545
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
546
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
547
|
-
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
548
|
-
|
549
538
|
enum ggml_cgraph_eval_order {
|
550
539
|
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
551
540
|
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
552
541
|
GGML_CGRAPH_EVAL_ORDER_COUNT
|
553
542
|
};
|
554
543
|
|
544
|
+
struct ggml_hash_set {
|
545
|
+
size_t size;
|
546
|
+
struct ggml_tensor ** keys;
|
547
|
+
};
|
548
|
+
|
555
549
|
// computation graph
|
556
550
|
struct ggml_cgraph {
|
551
|
+
int size;
|
557
552
|
int n_nodes;
|
558
553
|
int n_leafs;
|
559
554
|
|
560
|
-
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
561
|
-
struct ggml_tensor * grads[GGML_MAX_NODES];
|
562
|
-
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
555
|
+
struct ggml_tensor ** nodes;
|
556
|
+
struct ggml_tensor ** grads;
|
557
|
+
struct ggml_tensor ** leafs;
|
563
558
|
|
564
|
-
|
559
|
+
struct ggml_hash_set visited_hash_table;
|
565
560
|
|
566
561
|
enum ggml_cgraph_eval_order order;
|
567
562
|
|
@@ -571,8 +566,6 @@ extern "C" {
|
|
571
566
|
int64_t perf_time_us;
|
572
567
|
};
|
573
568
|
|
574
|
-
static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
|
575
|
-
|
576
569
|
// scratch buffer
|
577
570
|
struct ggml_scratch {
|
578
571
|
size_t offs;
|
@@ -617,6 +610,8 @@ extern "C" {
|
|
617
610
|
GGML_API int64_t ggml_cycles(void);
|
618
611
|
GGML_API int64_t ggml_cycles_per_ms(void);
|
619
612
|
|
613
|
+
GGML_API void ggml_print_backtrace(void);
|
614
|
+
|
620
615
|
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
|
621
616
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
622
617
|
|
@@ -709,7 +704,7 @@ extern "C" {
|
|
709
704
|
// Context tensor enumeration and lookup
|
710
705
|
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
711
706
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
712
|
-
GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
|
707
|
+
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
713
708
|
|
714
709
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
715
710
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
@@ -943,6 +938,10 @@ extern "C" {
|
|
943
938
|
struct ggml_context * ctx,
|
944
939
|
struct ggml_tensor * a);
|
945
940
|
|
941
|
+
GGML_API struct ggml_tensor * ggml_leaky(
|
942
|
+
struct ggml_context * ctx,
|
943
|
+
struct ggml_tensor * a);
|
944
|
+
|
946
945
|
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
947
946
|
struct ggml_context * ctx,
|
948
947
|
struct ggml_tensor * a);
|
@@ -1372,8 +1371,13 @@ extern "C" {
|
|
1372
1371
|
int n_dims,
|
1373
1372
|
int mode,
|
1374
1373
|
int n_ctx,
|
1374
|
+
int n_orig_ctx,
|
1375
1375
|
float freq_base,
|
1376
1376
|
float freq_scale,
|
1377
|
+
float ext_factor,
|
1378
|
+
float attn_factor,
|
1379
|
+
float beta_fast,
|
1380
|
+
float beta_slow,
|
1377
1381
|
float xpos_base,
|
1378
1382
|
bool xpos_down);
|
1379
1383
|
|
@@ -1394,6 +1398,18 @@ extern "C" {
|
|
1394
1398
|
float min,
|
1395
1399
|
float max);
|
1396
1400
|
|
1401
|
+
GGML_API struct ggml_tensor * ggml_im2col(
|
1402
|
+
struct ggml_context * ctx,
|
1403
|
+
struct ggml_tensor * a,
|
1404
|
+
struct ggml_tensor * b,
|
1405
|
+
int s0,
|
1406
|
+
int s1,
|
1407
|
+
int p0,
|
1408
|
+
int p1,
|
1409
|
+
int d0,
|
1410
|
+
int d1,
|
1411
|
+
bool is_2D);
|
1412
|
+
|
1397
1413
|
GGML_API struct ggml_tensor * ggml_conv_1d(
|
1398
1414
|
struct ggml_context * ctx,
|
1399
1415
|
struct ggml_tensor * a,
|
@@ -1477,6 +1493,8 @@ extern "C" {
|
|
1477
1493
|
int s0, // stride
|
1478
1494
|
int p0); // padding
|
1479
1495
|
|
1496
|
+
// the result will have 2*p0 padding for the first dimension
|
1497
|
+
// and 2*p1 padding for the second dimension
|
1480
1498
|
GGML_API struct ggml_tensor * ggml_pool_2d(
|
1481
1499
|
struct ggml_context * ctx,
|
1482
1500
|
struct ggml_tensor * a,
|
@@ -1485,8 +1503,8 @@ extern "C" {
|
|
1485
1503
|
int k1,
|
1486
1504
|
int s0,
|
1487
1505
|
int s1,
|
1488
|
-
|
1489
|
-
|
1506
|
+
float p0,
|
1507
|
+
float p1);
|
1490
1508
|
|
1491
1509
|
// nearest interpolate
|
1492
1510
|
// used in stable-diffusion
|
@@ -1727,19 +1745,22 @@ extern "C" {
|
|
1727
1745
|
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1728
1746
|
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
1729
1747
|
|
1730
|
-
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1731
|
-
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
1732
|
-
|
1733
1748
|
// graph allocation in a context
|
1734
|
-
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
|
1735
|
-
GGML_API struct ggml_cgraph *
|
1749
|
+
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
1750
|
+
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
1751
|
+
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
1752
|
+
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
1753
|
+
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
1754
|
+
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
1755
|
+
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
1756
|
+
|
1736
1757
|
GGML_API size_t ggml_graph_overhead(void);
|
1758
|
+
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
1737
1759
|
|
1738
1760
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1739
1761
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1740
1762
|
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
1741
|
-
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1742
|
-
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
1763
|
+
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1743
1764
|
|
1744
1765
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
1745
1766
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
@@ -1747,8 +1768,8 @@ extern "C" {
|
|
1747
1768
|
|
1748
1769
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1749
1770
|
|
1750
|
-
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
1751
|
-
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
1771
|
+
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
1772
|
+
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
1752
1773
|
|
1753
1774
|
// print info and performance information for the graph
|
1754
1775
|
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
@@ -1811,6 +1832,8 @@ extern "C" {
|
|
1811
1832
|
struct ggml_opt_params {
|
1812
1833
|
enum ggml_opt_type type;
|
1813
1834
|
|
1835
|
+
size_t graph_size;
|
1836
|
+
|
1814
1837
|
int n_threads;
|
1815
1838
|
|
1816
1839
|
// delta-based convergence test
|