llama_cpp 0.9.2 → 0.9.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -58,7 +58,8 @@
|
|
58
58
|
// {
|
59
59
|
// ...
|
60
60
|
//
|
61
|
-
// struct ggml_cgraph gf =
|
61
|
+
// struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
62
|
+
// ggml_build_forward_expand(gf, f);
|
62
63
|
//
|
63
64
|
// // set the input variable and parameter values
|
64
65
|
// ggml_set_f32(x, 2.0f);
|
@@ -213,15 +214,14 @@
|
|
213
214
|
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
214
215
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
215
216
|
|
216
|
-
#define GGML_MAX_DIMS
|
217
|
-
#define
|
218
|
-
#define
|
219
|
-
#define
|
220
|
-
#define
|
221
|
-
#define
|
222
|
-
#define
|
223
|
-
#define
|
224
|
-
|
217
|
+
#define GGML_MAX_DIMS 4
|
218
|
+
#define GGML_MAX_PARAMS 1024
|
219
|
+
#define GGML_MAX_CONTEXTS 64
|
220
|
+
#define GGML_MAX_SRC 6
|
221
|
+
#define GGML_MAX_NAME 64
|
222
|
+
#define GGML_MAX_OP_PARAMS 64
|
223
|
+
#define GGML_DEFAULT_N_THREADS 4
|
224
|
+
#define GGML_DEFAULT_GRAPH_SIZE 2048
|
225
225
|
#if UINTPTR_MAX == 0xFFFFFFFF
|
226
226
|
#define GGML_MEM_ALIGN 4
|
227
227
|
#else
|
@@ -245,7 +245,10 @@
|
|
245
245
|
do { \
|
246
246
|
if (!(x)) { \
|
247
247
|
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
248
|
-
|
248
|
+
fflush(stderr); \
|
249
|
+
fflush(stdout); \
|
250
|
+
ggml_print_backtrace(); \
|
251
|
+
exit(1); \
|
249
252
|
} \
|
250
253
|
} while (0)
|
251
254
|
|
@@ -400,13 +403,8 @@ extern "C" {
|
|
400
403
|
GGML_OP_ROPE_BACK,
|
401
404
|
GGML_OP_ALIBI,
|
402
405
|
GGML_OP_CLAMP,
|
403
|
-
GGML_OP_CONV_1D,
|
404
|
-
GGML_OP_CONV_1D_STAGE_0, // internal
|
405
|
-
GGML_OP_CONV_1D_STAGE_1, // internal
|
406
406
|
GGML_OP_CONV_TRANSPOSE_1D,
|
407
|
-
|
408
|
-
GGML_OP_CONV_2D_STAGE_0, // internal
|
409
|
-
GGML_OP_CONV_2D_STAGE_1, // internal
|
407
|
+
GGML_OP_IM2COL,
|
410
408
|
GGML_OP_CONV_TRANSPOSE_2D,
|
411
409
|
GGML_OP_POOL_1D,
|
412
410
|
GGML_OP_POOL_2D,
|
@@ -451,6 +449,7 @@ extern "C" {
|
|
451
449
|
GGML_UNARY_OP_GELU,
|
452
450
|
GGML_UNARY_OP_GELU_QUICK,
|
453
451
|
GGML_UNARY_OP_SILU,
|
452
|
+
GGML_UNARY_OP_LEAKY
|
454
453
|
};
|
455
454
|
|
456
455
|
enum ggml_object_type {
|
@@ -531,37 +530,33 @@ extern "C" {
|
|
531
530
|
|
532
531
|
int n_threads;
|
533
532
|
|
534
|
-
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
535
|
-
int n_tasks[GGML_MAX_NODES];
|
536
|
-
|
537
533
|
// abort ggml_graph_compute when true
|
538
534
|
bool (*abort_callback)(void * data);
|
539
535
|
void * abort_callback_data;
|
540
536
|
};
|
541
537
|
|
542
|
-
// next prime after GGML_MAX_NODES
|
543
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
544
|
-
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
545
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
546
|
-
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
547
|
-
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
548
|
-
|
549
538
|
enum ggml_cgraph_eval_order {
|
550
539
|
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
551
540
|
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
552
541
|
GGML_CGRAPH_EVAL_ORDER_COUNT
|
553
542
|
};
|
554
543
|
|
544
|
+
struct ggml_hash_set {
|
545
|
+
size_t size;
|
546
|
+
struct ggml_tensor ** keys;
|
547
|
+
};
|
548
|
+
|
555
549
|
// computation graph
|
556
550
|
struct ggml_cgraph {
|
551
|
+
int size;
|
557
552
|
int n_nodes;
|
558
553
|
int n_leafs;
|
559
554
|
|
560
|
-
struct ggml_tensor
|
561
|
-
struct ggml_tensor
|
562
|
-
struct ggml_tensor
|
555
|
+
struct ggml_tensor ** nodes;
|
556
|
+
struct ggml_tensor ** grads;
|
557
|
+
struct ggml_tensor ** leafs;
|
563
558
|
|
564
|
-
|
559
|
+
struct ggml_hash_set visited_hash_table;
|
565
560
|
|
566
561
|
enum ggml_cgraph_eval_order order;
|
567
562
|
|
@@ -571,8 +566,6 @@ extern "C" {
|
|
571
566
|
int64_t perf_time_us;
|
572
567
|
};
|
573
568
|
|
574
|
-
static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
|
575
|
-
|
576
569
|
// scratch buffer
|
577
570
|
struct ggml_scratch {
|
578
571
|
size_t offs;
|
@@ -617,6 +610,8 @@ extern "C" {
|
|
617
610
|
GGML_API int64_t ggml_cycles(void);
|
618
611
|
GGML_API int64_t ggml_cycles_per_ms(void);
|
619
612
|
|
613
|
+
GGML_API void ggml_print_backtrace(void);
|
614
|
+
|
620
615
|
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
|
621
616
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
622
617
|
|
@@ -709,7 +704,7 @@ extern "C" {
|
|
709
704
|
// Context tensor enumeration and lookup
|
710
705
|
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
711
706
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
712
|
-
GGML_API struct ggml_tensor * ggml_get_tensor
|
707
|
+
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
713
708
|
|
714
709
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
715
710
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
@@ -943,6 +938,10 @@ extern "C" {
|
|
943
938
|
struct ggml_context * ctx,
|
944
939
|
struct ggml_tensor * a);
|
945
940
|
|
941
|
+
GGML_API struct ggml_tensor * ggml_leaky(
|
942
|
+
struct ggml_context * ctx,
|
943
|
+
struct ggml_tensor * a);
|
944
|
+
|
946
945
|
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
947
946
|
struct ggml_context * ctx,
|
948
947
|
struct ggml_tensor * a);
|
@@ -1399,6 +1398,18 @@ extern "C" {
|
|
1399
1398
|
float min,
|
1400
1399
|
float max);
|
1401
1400
|
|
1401
|
+
GGML_API struct ggml_tensor * ggml_im2col(
|
1402
|
+
struct ggml_context * ctx,
|
1403
|
+
struct ggml_tensor * a,
|
1404
|
+
struct ggml_tensor * b,
|
1405
|
+
int s0,
|
1406
|
+
int s1,
|
1407
|
+
int p0,
|
1408
|
+
int p1,
|
1409
|
+
int d0,
|
1410
|
+
int d1,
|
1411
|
+
bool is_2D);
|
1412
|
+
|
1402
1413
|
GGML_API struct ggml_tensor * ggml_conv_1d(
|
1403
1414
|
struct ggml_context * ctx,
|
1404
1415
|
struct ggml_tensor * a,
|
@@ -1482,6 +1493,8 @@ extern "C" {
|
|
1482
1493
|
int s0, // stride
|
1483
1494
|
int p0); // padding
|
1484
1495
|
|
1496
|
+
// the result will have 2*p0 padding for the first dimension
|
1497
|
+
// and 2*p1 padding for the second dimension
|
1485
1498
|
GGML_API struct ggml_tensor * ggml_pool_2d(
|
1486
1499
|
struct ggml_context * ctx,
|
1487
1500
|
struct ggml_tensor * a,
|
@@ -1490,8 +1503,8 @@ extern "C" {
|
|
1490
1503
|
int k1,
|
1491
1504
|
int s0,
|
1492
1505
|
int s1,
|
1493
|
-
|
1494
|
-
|
1506
|
+
float p0,
|
1507
|
+
float p1);
|
1495
1508
|
|
1496
1509
|
// nearest interpolate
|
1497
1510
|
// used in stable-diffusion
|
@@ -1732,19 +1745,22 @@ extern "C" {
|
|
1732
1745
|
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1733
1746
|
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
1734
1747
|
|
1735
|
-
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1736
|
-
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
1737
|
-
|
1738
1748
|
// graph allocation in a context
|
1739
|
-
GGML_API struct ggml_cgraph * ggml_new_graph
|
1740
|
-
GGML_API struct ggml_cgraph *
|
1749
|
+
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
1750
|
+
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
1751
|
+
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
1752
|
+
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
|
1753
|
+
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
1754
|
+
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
1755
|
+
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
1756
|
+
|
1741
1757
|
GGML_API size_t ggml_graph_overhead(void);
|
1758
|
+
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
1742
1759
|
|
1743
1760
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
1744
1761
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
1745
1762
|
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
1746
|
-
GGML_API
|
1747
|
-
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
1763
|
+
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
1748
1764
|
|
1749
1765
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
1750
1766
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
@@ -1752,8 +1768,8 @@ extern "C" {
|
|
1752
1768
|
|
1753
1769
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
1754
1770
|
|
1755
|
-
GGML_API void
|
1756
|
-
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
1771
|
+
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
1772
|
+
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
1757
1773
|
|
1758
1774
|
// print info and performance information for the graph
|
1759
1775
|
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
@@ -1816,6 +1832,8 @@ extern "C" {
|
|
1816
1832
|
struct ggml_opt_params {
|
1817
1833
|
enum ggml_opt_type type;
|
1818
1834
|
|
1835
|
+
size_t graph_size;
|
1836
|
+
|
1819
1837
|
int n_threads;
|
1820
1838
|
|
1821
1839
|
// delta-based convergence test
|