llama_cpp 0.3.7 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml-alloc.c

@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,6 +68,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -74,7 +77,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -83,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -111,10 +114,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     size_t max_avail = 0;
 
-    // find the best fitting free block
+    // find the best fitting free block besides the last block
     int best_fit_block = -1;
     size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         max_avail = MAX(max_avail, block->size);
         if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +129,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     AT_PRINTF("block %d\n", best_fit_block);
 
     if (best_fit_block == -1) {
-        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
             return;
+        }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
@@ -229,6 +239,13 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
+    for (int i = 0; i < n; i++) {
+        alloc->parse_seq[i] = list[i];
+    }
+    alloc->parse_seq_len = n;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +265,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +294,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -421,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 else {
                     AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                     node->data = parent->data;
+                    return;
                 }
-                return;
             }
         }
     }
@@ -473,63 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
-
-            // allocate parents (leafs)
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
                 }
-                allocate_node(alloc, parent);
-            }
 
-            // allocate node
-            allocate_node(alloc, node);
+                // allocate node
+                allocate_node(alloc, node);
 
-            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                AT_PRINTF("%s", parent->name);
-                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                    AT_PRINTF(", ");
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
                 }
+                AT_PRINTF("\n");
             }
-            AT_PRINTF("\n");
+
 
             // update parents
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                struct hash_node * p_hn = hash_get(ht, parent);
-                p_hn->n_children -= 1;
-
-                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
-                        struct hash_node * view_src_hn = hash_get(ht, view_src);
-                        view_src_hn->n_views -= 1;
-                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, view_src);
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end   = alloc->parse_seq_len ? ind : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
                         }
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = get_view_source(parent);
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s\n", view_src->name);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
+                        }
                     }
-                    else {
-                        if (parent->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, parent);
-                        }
-                    }
                 }
+                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
             }
-            AT_PRINTF("\n");
         }
         // free graph outputs here that wouldn't be freed otherwise because they have no children
         if (outputs != NULL && outputs[g] != NULL) {
data/ext/llama_cpp/src/ggml-alloc.h

@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);