llama_cpp 0.3.7 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml-alloc.c

@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,6 +68,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -74,7 +77,7 @@
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -83,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -111,10 +114,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     size_t max_avail = 0;
 
-    // find the best fitting free block
+    // find the best fitting free block besides the last block
     int best_fit_block = -1;
     size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         max_avail = MAX(max_avail, block->size);
         if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +129,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     AT_PRINTF("block %d\n", best_fit_block);
 
     if (best_fit_block == -1) {
-        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
+            return;
+        }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
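The reworked search above skips the last free block during the best-fit scan and reaches for it only when no other block can satisfy the request, which keeps the large trailing block from being fragmented early. A minimal standalone sketch of that selection policy; `free_block`, the table, and the sizes here are illustrative stand-ins, not the allocator's real state:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// illustrative stand-in for the allocator's free-block table
struct free_block { size_t size; };

// best fit over blocks [0, n-2]; block n-1 is only a last resort
static int pick_block(const struct free_block * blocks, int n, size_t size) {
    int best = -1;
    size_t best_size = SIZE_MAX;
    for (int i = 0; i < n - 1; i++) {
        if (blocks[i].size >= size && blocks[i].size <= best_size) {
            best = i;
            best_size = blocks[i].size;
        }
    }
    if (best == -1 && blocks[n - 1].size >= size) {
        best = n - 1; // the last block is our last resort
    }
    return best;
}

int main(void) {
    const struct free_block blocks[] = { {64}, {256}, {1024} };
    printf("%d\n", pick_block(blocks, 3, 100)); // 1: best fit among the leading blocks
    printf("%d\n", pick_block(blocks, 3, 512)); // 2: falls back to the last block
    return 0;
}
```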
@@ -229,6 +239,13 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
+    for (int i = 0; i < n; i++) {
+        alloc->parse_seq[i] = list[i];
+    }
+    alloc->parse_seq_len = n;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
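`ggml_allocr_set_parse_seq` above simply copies the caller's execution order into the allocator; in the graph-walking loop later in this file, a `-1` entry marks a barrier at which freed tensors are reclaimed. A hedged sketch of a call site, with node indices and grouping invented purely for illustration:

```c
#include "ggml-alloc.h"

// hypothetical schedule for a 6-node graph: nodes 0 and 2 may run
// concurrently, then a barrier (-1), then nodes 1, 3, 4 and 5
static void set_example_order(struct ggml_allocr * alloc) {
    const int order[] = { 0, 2, -1, 1, 3, 4, 5, -1 };
    ggml_allocr_set_parse_seq(alloc, order, (int)(sizeof(order) / sizeof(order[0])));
}
```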
@@ -248,6 +265,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +294,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -421,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                         else {
                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                             node->data = parent->data;
+                            return;
                         }
-                        return;
                     }
                 }
             }
@@ -473,63 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
-
-            // allocate parents (leafs)
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
                 }
-                allocate_node(alloc, parent);
-            }
 
-            // allocate node
-            allocate_node(alloc, node);
+                // allocate node
+                allocate_node(alloc, node);
 
-            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                AT_PRINTF("%s", parent->name);
-                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                    AT_PRINTF(", ");
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
                 }
+                AT_PRINTF("\n");
             }
-            AT_PRINTF("\n");
+
 
             // update parents
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                struct hash_node * p_hn = hash_get(ht, parent);
-                p_hn->n_children -= 1;
-
-                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
-                        struct hash_node * view_src_hn = hash_get(ht, view_src);
-                        view_src_hn->n_views -= 1;
-                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, view_src);
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end = alloc->parse_seq_len ? ind : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
                         }
-                    }
-                    else {
-                        if (parent->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, parent);
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = get_view_source(parent);
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s\n", view_src->name);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
                         }
                     }
                 }
+                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
             }
-            AT_PRINTF("\n");
         }
         // free graph outputs here that wouldn't be freed otherwise because they have no children
         if (outputs != NULL && outputs[g] != NULL) {
@@ -10,6 +10,10 @@ extern "C" {
|
|
10
10
|
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
11
11
|
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
12
12
|
|
13
|
+
// tell the allocator to parse nodes following the order described in the list
|
14
|
+
// you should call this if your graph are optimized to execute out-of-order
|
15
|
+
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
|
16
|
+
|
13
17
|
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
14
18
|
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
15
19
|
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
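Putting the new entry points together, a two-pass flow would look roughly like the sketch below. It assumes `ggml_allocr_alloc_graph` (the graph-allocation call declared elsewhere in this header), a 32-byte alignment, and a scheduler-produced `order` array; none of that wiring appears in this diff:

```c
#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"

// sketch: measure how much memory a graph needs, then allocate for real,
// both times following the same out-of-order parse sequence
static void * alloc_graph_two_pass(struct ggml_cgraph * gf,
                                   const int * order, int n_order) {
    // pass 1: a measure allocator records sizes without real memory
    struct ggml_allocr * measure = ggml_allocr_new_measure(32);
    ggml_allocr_set_parse_seq(measure, order, n_order);
    size_t needed = ggml_allocr_alloc_graph(measure, gf);
    ggml_allocr_free(measure);

    // pass 2: place the tensors in a real buffer
    // (in practice the graph is rebuilt between the two passes)
    void * buf = malloc(needed);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, needed, 32);
    ggml_allocr_set_parse_seq(alloc, order, n_order);
    ggml_allocr_alloc_graph(alloc, gf);
    ggml_allocr_free(alloc);
    return buf; // caller runs the graph, then free(buf)
}
```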