llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml-alloc.c:
@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }

-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
 }

 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }

+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);

     assert(align && !(align & (align - 1))); // power of 2

-
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.base = */ base,
         /*.alignment = */ align,
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }

-void
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);

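
For orientation, a minimal sketch (not part of the gem) of how calling code adapts to the value-based allocator API shown in the hunks above. The helper name and the variables `buf` and `t` are hypothetical; the API calls themselves, including the removed ggml_tallocr_free, are taken from this diff.

    #include "ggml.h"
    #include "ggml-alloc.h"

    // Hypothetical helper illustrating the value-based allocator API from the hunks
    // above; buf and t are assumed to be a valid backend buffer and tensor.
    static void alloc_one_tensor(ggml_backend_buffer_t buf, struct ggml_tensor * t) {
        // 0.13.0 vendored code:
        //   ggml_tallocr_t talloc = ggml_tallocr_new(buf);   // heap-allocated handle
        //   ggml_tallocr_alloc(talloc, t);
        //   ggml_tallocr_free(talloc);                       // had to be freed
        // 0.14.1 vendored code:
        struct ggml_tallocr talloc = ggml_tallocr_new(buf);   // returned by value
        ggml_tallocr_alloc(&talloc, t);                       // takes a pointer to the struct
        // no free call is needed; the allocator no longer owns heap memory
    }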
@@ -354,12 +338,16 @@ struct hash_node {
     bool allocated;
 };

-//
 struct tensor_alloc {
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };

+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
     int buffer_id;
     struct tensor_alloc dst;
@@ -378,7 +366,7 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;

-    struct
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
 };

@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }

-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));

+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
     // count number of children and views
-    // allocate
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];

@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
         }
     }

-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }

-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;

     // initialize hash table
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }

     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(
+        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].
-        galloc->leaf_allocs[i].
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        galloc->leaf_allocs[i].leaf.offset = hn->offset;
+        galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
     }

     // reallocate buffers if needed
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

-        if
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }

 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
-    assert(
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

-    if (
-    if (
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
             assert(tensor_alloc->offset == SIZE_MAX);
-            if (
+            if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id],
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
         }
     } else {
-        if (
+        if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id],
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id],
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
         } else {
-            if (
+            if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)

     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
         if (galloc->buffers[i] != NULL) {
             ggml_backend_buffer_reset(galloc->buffers[i]);
         }
     }

     // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
         }
         ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
     }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
-    }

     return true;
 }
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }

-    struct ggml_tallocr
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
             }
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         }
     }

-    ggml_tallocr_free(tallocr);
-
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     (*buffers)[(*n_buffers)++] = buffer;

data/vendor/tmp/llama.cpp/ggml-alloc.h:
@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;

 // Tensor allocator
-
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};

-GGML_API
-GGML_API void
-GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

 // Graph allocator
 /*
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);

 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
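
An illustrative sketch, not part of the gem, of how the extended ggml_gallocr_reserve_n declared above might be called when nodes and leafs are split across backend buffers. The wrapper name is hypothetical; `galloc`, `graph`, and the two id arrays are assumed to come from the caller.

    #include "ggml.h"
    #include "ggml-alloc.h"

    // Hypothetical wrapper around the extended reserve entry point.
    static bool reserve_with_leaf_buffers(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
                                          const int * node_buffer_ids, const int * leaf_buffer_ids) {
        // 0.14.x adds the trailing leaf_buffer_ids argument; passing NULL for it
        // keeps the old behavior of placing leafs in buffer 0 (see get_node_buffer_id above).
        return ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids, leaf_buffer_ids);
    }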
data/vendor/tmp/llama.cpp/ggml-backend-impl.h:
@@ -86,31 +86,43 @@ extern "C" {
         // (optional) asynchronous tensor data access
         void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

         // (optional) complete all pending operations
         void (*GGML_CALL synchronize)(ggml_backend_t backend);

-        // compute graph with a plan
+        // compute graph with a plan (not used currently)
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
-
+        enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+        void (*GGML_CALL event_free) (ggml_backend_event_t event);
+        void (*GGML_CALL event_record) (ggml_backend_event_t event);
+        void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+        void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };

     struct ggml_backend {
         ggml_guid_t guid;

         struct ggml_backend_i iface;
-
         ggml_backend_context_t context;
     };

+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
     //
     // Backend registry
     //
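
For context, a hedged sketch of what the graph_compute signature change above means for callers. It assumes the public wrapper ggml_backend_graph_compute in ggml-backend.h (also updated in this release, per the file list) forwards the new enum ggml_status, and that GGML_STATUS_SUCCESS is the success value defined in ggml.h; the helper name is hypothetical.

    #include "ggml.h"
    #include "ggml-backend.h"

    // Hypothetical helper: run a graph and report success, assuming the public
    // wrapper now returns enum ggml_status instead of a plain bool.
    static bool compute_graph_ok(ggml_backend_t backend, struct ggml_cgraph * graph) {
        enum ggml_status status = ggml_backend_graph_compute(backend, graph);
        return status == GGML_STATUS_SUCCESS;
    }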