llama_cpp 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -58,7 +58,8 @@
|
|
58
58
|
// {
|
59
59
|
// ...
|
60
60
|
//
|
61
|
-
// struct ggml_cgraph gf = ggml_build_forward(f);
|
61
|
+
// struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
62
|
+
// ggml_build_forward_expand(gf, f);
|
62
63
|
//
|
63
64
|
// // set the input variable and parameter values
|
64
65
|
// ggml_set_f32(x, 2.0f);
|
@@ -213,15 +214,14 @@
|
|
213
214
|
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
214
215
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
215
216
|
|
216
|
-
#define GGML_MAX_DIMS 4
|
217
|
-
#define
|
218
|
-
#define
|
219
|
-
#define
|
220
|
-
#define
|
221
|
-
#define
|
222
|
-
#define
|
223
|
-
#define
|
224
|
-
|
217
|
+
#define GGML_MAX_DIMS 4
|
218
|
+
#define GGML_MAX_PARAMS 1024
|
219
|
+
#define GGML_MAX_CONTEXTS 64
|
220
|
+
#define GGML_MAX_SRC 6
|
221
|
+
#define GGML_MAX_NAME 64
|
222
|
+
#define GGML_MAX_OP_PARAMS 64
|
223
|
+
#define GGML_DEFAULT_N_THREADS 4
|
224
|
+
#define GGML_DEFAULT_GRAPH_SIZE 2048
|
225
225
|
#if UINTPTR_MAX == 0xFFFFFFFF
|
226
226
|
#define GGML_MEM_ALIGN 4
|
227
227
|
#else
|
@@ -245,7 +245,10 @@
|
|
245
245
|
do { \
|
246
246
|
if (!(x)) { \
|
247
247
|
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
248
|
-
|
248
|
+
fflush(stderr); \
|
249
|
+
fflush(stdout); \
|
250
|
+
ggml_print_backtrace(); \
|
251
|
+
exit(1); \
|
249
252
|
} \
|
250
253
|
} while (0)
|
251
254
|
|
@@ -400,13 +403,8 @@ extern "C" {
|
|
400
403
|
GGML_OP_ROPE_BACK,
|
401
404
|
GGML_OP_ALIBI,
|
402
405
|
GGML_OP_CLAMP,
|
403
|
-
GGML_OP_CONV_1D,
|
404
|
-
GGML_OP_CONV_1D_STAGE_0, // internal
|
405
|
-
GGML_OP_CONV_1D_STAGE_1, // internal
|
406
406
|
GGML_OP_CONV_TRANSPOSE_1D,
|
407
|
-
|
408
|
-
GGML_OP_CONV_2D_STAGE_0, // internal
|
409
|
-
GGML_OP_CONV_2D_STAGE_1, // internal
|
407
|
+
GGML_OP_IM2COL,
|
410
408
|
GGML_OP_CONV_TRANSPOSE_2D,
|
411
409
|
GGML_OP_POOL_1D,
|
412
410
|
GGML_OP_POOL_2D,
|
@@ -451,6 +449,7 @@ extern "C" {
|
|
451
449
|
GGML_UNARY_OP_GELU,
|
452
450
|
GGML_UNARY_OP_GELU_QUICK,
|
453
451
|
GGML_UNARY_OP_SILU,
|
452
|
+
GGML_UNARY_OP_LEAKY
|
454
453
|
};
|
455
454
|
|
456
455
|
enum ggml_object_type {
|
@@ -531,37 +530,33 @@ extern "C" {
|
|
531
530
|
|
532
531
|
int n_threads;
|
533
532
|
|
534
|
-
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
535
|
-
int n_tasks[GGML_MAX_NODES];
|
536
|
-
|
537
533
|
// abort ggml_graph_compute when true
|
538
534
|
bool (*abort_callback)(void * data);
|
539
535
|
void * abort_callback_data;
|
540
536
|
};
|
541
537
|
|
542
|
-
// next prime after GGML_MAX_NODES
|
543
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
544
|
-
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
545
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
546
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
547
|
-
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
548
|
-
|
549
538
|
enum ggml_cgraph_eval_order {
|
550
539
|
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
551
540
|
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
552
541
|
GGML_CGRAPH_EVAL_ORDER_COUNT
|
553
542
|
};
|
554
543
|
|
544
|
+
struct ggml_hash_set {
|
545
|
+
size_t size;
|
546
|
+
struct ggml_tensor ** keys;
|
547
|
+
};
|
548
|
+
|
555
549
|
// computation graph
|
556
550
|
struct ggml_cgraph {
|
551
|
+
int size;
|
557
552
|
int n_nodes;
|
558
553
|
int n_leafs;
|
559
554
|
|
560
|
-
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
561
|
-
struct ggml_tensor * grads[GGML_MAX_NODES];
|
562
|
-
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
555
|
+
struct ggml_tensor ** nodes;
|
556
|
+
struct ggml_tensor ** grads;
|
557
|
+
struct ggml_tensor ** leafs;
|
563
558
|
|
564
|
-
|
559
|
+
struct ggml_hash_set visited_hash_table;
|
565
560
|
|
566
561
|
enum ggml_cgraph_eval_order order;
|
567
562
|
|
@@ -571,8 +566,6 @@ extern "C" {
|
|
571
566
|
int64_t perf_time_us;
|
572
567
|
};
|
573
568
|
|
574
|
-
static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
|
575
|
-
|
576
569
|
// scratch buffer
|
577
570
|
struct ggml_scratch {
|
578
571
|
size_t offs;
|
@@ -617,6 +610,8 @@ extern "C" {
|
|
617
610
|
GGML_API int64_t ggml_cycles(void);
|
618
611
|
GGML_API int64_t ggml_cycles_per_ms(void);
|
619
612
|
|
613
|
+
GGML_API void ggml_print_backtrace(void);
|
614
|
+
|
620
615
|
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
|
621
616
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
622
617
|
|
@@ -709,7 +704,7 @@ extern "C" {
|
|
709
704
|
// Context tensor enumeration and lookup
|
710
705
|
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
711
706
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
712
|
-
GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
|
707
|
+
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
713
708
|
|
714
709
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
715
710
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
@@ -943,6 +938,10 @@ extern "C" {
|
|
943
938
|
struct ggml_context * ctx,
|
944
939
|
struct ggml_tensor * a);
|
945
940
|
|
941
|
+
GGML_API struct ggml_tensor * ggml_leaky(
|
942
|
+
struct ggml_context * ctx,
|
943
|
+
struct ggml_tensor * a);
|
944
|
+
|
946
945
|
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
947
946
|
struct ggml_context * ctx,
|
948
947
|
struct ggml_tensor * a);
|
@@ -1399,6 +1398,18 @@ extern "C" {
|
|
1399
1398
|
float min,
|
1400
1399
|
float max);
|
1401
1400
|
|
1401
|
+
GGML_API struct ggml_tensor * ggml_im2col(
|
1402
|
+
struct ggml_context * ctx,
|
1403
|
+
struct ggml_tensor * a,
|
1404
|
+
struct ggml_tensor * b,
|
1405
|
+
int s0,
|
1406
|
+
int s1,
|
1407
|
+
int p0,
|
1408
|
+
int p1,
|
1409
|
+
int d0,
|
1410
|
+
int d1,
|
1411
|
+
bool is_2D);
|
1412
|
+
|
1402
1413
|
GGML_API struct ggml_tensor * ggml_conv_1d(
|
1403
1414
|
struct ggml_context * ctx,
|
1404
1415
|
struct ggml_tensor * a,
|
@@ -1482,6 +1493,8 @@ extern "C" {
|
|
1482
1493
|
int s0, // stride
|
1483
1494
|
int p0); // padding
|
1484
1495
|
|
1496
|
+
// the result will have 2*p0 padding for the first dimension
|
1497
|
+
// and 2*p1 padding for the second dimension
|
1485
1498
|
GGML_API struct ggml_tensor * ggml_pool_2d(
|
1486
1499
|
struct ggml_context * ctx,
|
1487
1500
|
struct ggml_tensor * a,
|
@@ -1490,8 +1503,8 @@ extern "C" {
|
|
1490
1503
|
int k1,
|
1491
1504
|
int s0,
|
1492
1505
|
int s1,
|
1493
|
-
|
1494
|
-
|
1506
|
+
float p0,
|
1507
|
+
float p1);
|
1495
1508
|
|
1496
1509
|
// nearest interpolate
|
1497
1510
|
// used in stable-diffusion
|
@@ -1732,19 +1745,22 @@ extern "C" {
|
|
1732
1745
|
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1733
1746
|
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
1734
1747
|
|
1735
|
-
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1736
|
-
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
1737
|
-
|
1738
1748
|
// graph allocation in a context
|
1739
|
-
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
|
1740
|
-
GGML_API struct ggml_cgraph *
|
1749
|
+
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
1750
|
+
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
1751
|
+
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
1752
|
+
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
1753
|
+
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
1754
|
+
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
1755
|
+
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
1756
|
+
|
1741
1757
|
GGML_API size_t ggml_graph_overhead(void);
|
1758
|
+
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
1742
1759
|
|
1743
1760
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1744
1761
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1745
1762
|
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
1746
|
-
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1747
|
-
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
1763
|
+
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1748
1764
|
|
1749
1765
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
1750
1766
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
@@ -1752,8 +1768,8 @@ extern "C" {
|
|
1752
1768
|
|
1753
1769
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1754
1770
|
|
1755
|
-
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
1756
|
-
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
1771
|
+
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
1772
|
+
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
1757
1773
|
|
1758
1774
|
// print info and performance information for the graph
|
1759
1775
|
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
@@ -1816,6 +1832,8 @@ extern "C" {
|
|
1816
1832
|
struct ggml_opt_params {
|
1817
1833
|
enum ggml_opt_type type;
|
1818
1834
|
|
1835
|
+
size_t graph_size;
|
1836
|
+
|
1819
1837
|
int n_threads;
|
1820
1838
|
|
1821
1839
|
// delta-based convergence test
|