llama_cpp 0.13.0 → 0.14.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED

@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
 }
 
 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }
 
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
 
     assert(align && !(align & (align - 1))); // power of 2
 
-    *talloc = (struct ggml_tallocr) {
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.base = */ base,
         /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }
 
-void ggml_tallocr_free(ggml_tallocr_t talloc) {
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
 
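For callers of the vendored allocator API, the practical upshot of these hunks is that `ggml_tallocr` is now a by-value struct: `ggml_tallocr_new` returns it directly, `ggml_tallocr_alloc` takes its address, and `ggml_tallocr_free` is gone. A minimal migration sketch, not code from the gem (`buffer` and `tensor` are placeholder arguments):

    #include "ggml-alloc.h"

    // 0.13.x: heap-allocated handle that had to be freed
    //   ggml_tallocr_t talloc = ggml_tallocr_new(buffer);
    //   ggml_tallocr_alloc(talloc, tensor);
    //   ggml_tallocr_free(talloc);

    // 0.14.x: plain value on the stack, nothing to free
    static void alloc_one(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
        struct ggml_tallocr talloc = ggml_tallocr_new(buffer);
        ggml_tallocr_alloc(&talloc, tensor);
    }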
@@ -354,12 +338,16 @@ struct hash_node {
     bool allocated;
 };
 
-//
 struct tensor_alloc {
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
     int buffer_id;
     struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
 
-    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
 };
 
@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
 
+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
     // count number of children and views
-    // allocate all graph inputs and leafs first to avoid overwriting them
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
         }
     }
 
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;
 
     // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
     }
 
     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].offset = hn->offset;
-        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        galloc->leaf_allocs[i].leaf.offset = hn->offset;
+        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
     }
 
     // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-        if (new_size > cur_size) {
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
-    if (node->view_src != NULL) {
-        if (node->buffer == NULL) {
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
             assert(tensor_alloc->offset == SIZE_MAX);
-            if (node->view_src->buffer == NULL) {
+            if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
         }
     } else {
-        if (node->data == NULL) {
+        if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
         } else {
-            if (node->buffer == NULL) {
+            if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
 
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
         if (galloc->buffers[i] != NULL) {
            ggml_backend_buffer_reset(galloc->buffers[i]);
         }
     }
 
     // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
|
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
863
852
|
}
|
864
853
|
ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
|
865
854
|
}
|
866
|
-
// leafs
|
867
|
-
for (int i = 0; i < graph->n_leafs; i++) {
|
868
|
-
struct ggml_tensor * leaf = graph->leafs[i];
|
869
|
-
struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
870
|
-
ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
|
871
|
-
}
|
872
855
|
|
873
856
|
return true;
|
874
857
|
}
|
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }
 
-    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
             }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         }
     }
 
-    ggml_tallocr_free(tallocr);
-
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     (*buffers)[(*n_buffers)++] = buffer;
 
data/vendor/tmp/llama.cpp/ggml-alloc.h CHANGED

@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
-GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
 /*
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
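Existing single-buffer callers are unaffected: passing NULL for both id arrays is exactly what `ggml_gallocr_reserve` does, and a NULL array maps every entry to buffer 0 (see `get_node_buffer_id` above). A hedged sketch of a multi-buffer caller, with hypothetical id arrays chosen by the application:

    #include <stdbool.h>
    #include "ggml-alloc.h"

    // sketch: reserve a graph whose nodes and leafs are pinned to caller-chosen
    // buffer ids; node_ids has graph->n_nodes entries, leaf_ids has
    // graph->n_leafs entries, and either may be NULL to mean "buffer 0"
    static bool reserve_pinned(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
                               const int * node_ids, const int * leaf_ids) {
        return ggml_gallocr_reserve_n(galloc, graph, node_ids, leaf_ids);
    }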
data/vendor/tmp/llama.cpp/ggml-backend-impl.h CHANGED

@@ -86,31 +86,43 @@ extern "C" {
         // (optional) asynchronous tensor data access
         void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-        // compute graph with a plan
+        // compute graph with a plan (not used currently)
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
-        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+        void (*GGML_CALL event_free) (ggml_backend_event_t event);
+        void (*GGML_CALL event_record) (ggml_backend_event_t event);
+        void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+        void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
 
     struct ggml_backend {
         ggml_guid_t guid;
 
         struct ggml_backend_i iface;
-
         ggml_backend_context_t context;
     };
 
+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
     //
     // Backend registry
     //