llama_cpp 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
@@ -1,7 +1,9 @@
|
|
1
|
-
#include "ggml-backend.h"
|
1
|
+
#include "ggml-backend-impl.h"
|
2
2
|
#include "ggml-alloc.h"
|
3
|
+
#include "ggml-impl.h"
|
3
4
|
|
4
5
|
#include <assert.h>
|
6
|
+
#include <limits.h>
|
5
7
|
#include <stdarg.h>
|
6
8
|
#include <stdio.h>
|
7
9
|
#include <stdlib.h>
|
@@ -33,6 +35,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
33
35
|
}
|
34
36
|
|
35
37
|
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
38
|
+
if (buffer == NULL) {
|
39
|
+
return;
|
40
|
+
}
|
41
|
+
|
36
42
|
if (buffer->iface.free_buffer != NULL) {
|
37
43
|
buffer->iface.free_buffer(buffer);
|
38
44
|
}
|
@@ -43,15 +49,20 @@ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
|
43
49
|
return ggml_backend_get_alignment(buffer->backend);
|
44
50
|
}
|
45
51
|
|
46
|
-
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
47
|
-
return buffer->iface.get_base(buffer);
|
48
|
-
}
|
49
|
-
|
50
52
|
// Returns the value of the buffer's `size` field.
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    const size_t n_bytes = buffer->size;
    return n_bytes;
}
|
53
55
|
|
56
|
+
// Asks the buffer implementation for its base address.
// A NULL base is treated as a fatal error (GGML_ASSERT).
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * const base = buffer->iface.get_base(buffer);
    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
    return base;
}
|
63
|
+
|
54
64
|
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
65
|
+
// get_alloc_size is optional, defaults to ggml_nbytes
|
55
66
|
if (buffer->iface.get_alloc_size) {
|
56
67
|
return buffer->iface.get_alloc_size(buffer, tensor);
|
57
68
|
}
|
@@ -59,12 +70,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
|
|
59
70
|
}
|
60
71
|
|
61
72
|
// Forwards to the buffer's init_tensor hook; the hook is optional and a NULL hook means "nothing to do".
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (!buffer->iface.init_tensor) {
        return;
    }
    buffer->iface.init_tensor(buffer, tensor);
}
|
66
78
|
|
67
79
|
void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
80
|
+
// free_tensor is optional
|
68
81
|
if (buffer->iface.free_tensor) {
|
69
82
|
buffer->iface.free_tensor(buffer, tensor);
|
70
83
|
}
|
@@ -73,14 +86,21 @@ void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_t
|
|
73
86
|
// backend
|
74
87
|
|
75
88
|
ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
|
76
|
-
return tensor->buffer->backend;
|
89
|
+
return tensor->buffer ? tensor->buffer->backend : NULL;
|
77
90
|
}
|
78
91
|
|
79
92
|
// Returns the name reported by the backend, or the literal string "NULL" for a NULL backend.
const char * ggml_backend_name(ggml_backend_t backend) {
    return backend != NULL ? backend->iface.get_name(backend) : "NULL";
}
|
82
98
|
|
83
99
|
// Releases a backend through its own free hook; a NULL backend is accepted and ignored.
void ggml_backend_free(ggml_backend_t backend) {
    if (backend != NULL) {
        backend->iface.free(backend);
    }
}
|
86
106
|
|
@@ -101,13 +121,23 @@ void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * dat
|
|
101
121
|
}
|
102
122
|
|
103
123
|
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
104
|
-
ggml_get_backend(tensor)
|
105
|
-
|
124
|
+
ggml_backend_t backend = ggml_get_backend(tensor);
|
125
|
+
|
126
|
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
127
|
+
GGML_ASSERT(backend != NULL && "tensor backend not set");
|
128
|
+
|
129
|
+
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
|
130
|
+
backend->iface.synchronize(backend);
|
106
131
|
}
|
107
132
|
|
108
133
|
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
109
|
-
ggml_get_backend(tensor)
|
110
|
-
|
134
|
+
ggml_backend_t backend = ggml_get_backend(tensor);
|
135
|
+
|
136
|
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
137
|
+
GGML_ASSERT(backend != NULL && "tensor backend not set");
|
138
|
+
|
139
|
+
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
|
140
|
+
backend->iface.synchronize(backend);
|
111
141
|
}
|
112
142
|
|
113
143
|
void ggml_backend_synchronize(ggml_backend_t backend) {
|
@@ -156,7 +186,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
|
|
156
186
|
//printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
|
157
187
|
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
158
188
|
|
159
|
-
//
|
189
|
+
// fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
|
160
190
|
|
161
191
|
if (src == dst) {
|
162
192
|
return;
|
@@ -234,6 +264,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backen
|
|
234
264
|
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
235
265
|
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
236
266
|
|
267
|
+
GGML_ASSERT(data != NULL && "failed to allocate buffer");
|
268
|
+
|
237
269
|
return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
|
238
270
|
}
|
239
271
|
|
@@ -271,8 +303,7 @@ static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml
|
|
271
303
|
}
|
272
304
|
|
273
305
|
static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
274
|
-
|
275
|
-
ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
|
306
|
+
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
276
307
|
|
277
308
|
UNUSED(backend);
|
278
309
|
}
|
@@ -383,3 +414,537 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
383
414
|
// Creates a backend buffer over caller-provided memory [ptr, ptr + size) using the
// cpu_backend_buffer_i_from_ptr interface.
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
    ggml_backend_buffer_t wrapped = ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
    return wrapped;
}
|
417
|
+
|
418
|
+
// scheduler
|
419
|
+
|
420
|
+
// Compile-time capacities of the scheduler's fixed-size arrays.
// maximum number of backends a scheduler can hold (asserted in ggml_backend_sched_new)
#define GGML_MAX_BACKENDS 4
|
421
|
+
// maximum number of splits a graph can be divided into (asserted in sched_split_graph)
#define GGML_MAX_SPLITS 256
|
422
|
+
// maximum number of input tensors recorded per split (asserted in sched_split_graph)
#define GGML_MAX_SPLIT_INPUTS 16
|
423
|
+
|
424
|
+
// One split of the scheduled graph: a contiguous range of nodes that share a single allocator.
struct ggml_backend_sched_split {
|
425
|
+
ggml_tallocr_t tallocr; // allocator assigned to the nodes of this split; the backend of its buffer computes the split
|
426
|
+
int i_start; // index of the first graph node of the split
|
427
|
+
int i_end; // one past the index of the last graph node of the split
|
428
|
+
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; // sources whose allocator differs from the node that reads them
|
429
|
+
int n_inputs; // number of used entries in inputs
|
430
|
+
struct ggml_cgraph * graph; // view of the scheduled graph over [i_start, i_end)
|
431
|
+
};
|
432
|
+
|
433
|
+
// Scheduler state: the registered backends, their allocators, the per-tensor assignment tables
// and the splits produced by sched_split_graph.
struct ggml_backend_sched {
|
434
|
+
int n_backends; // number of used entries in backends and tallocs
|
435
|
+
ggml_backend_t backends[GGML_MAX_BACKENDS]; // index doubles as priority (see sched_backend_prio, lower is better)
|
436
|
+
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS]; // one tensor allocator per backend, same index as backends
|
437
|
+
|
438
|
+
ggml_gallocr_t galloc; // graph allocator used by sched_alloc_splits
|
439
|
+
|
440
|
+
struct ggml_hash_set hash_set; // maps tensors to the slot ids that index the two tables below
|
441
|
+
ggml_tallocr_t * node_talloc; // [hash_set.size] allocator assigned to each tensor, NULL when unassigned
|
442
|
+
struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS] per-backend copy of each tensor, NULL until created
|
443
|
+
|
444
|
+
struct ggml_cgraph * graph; // graph copy built by sched_split_graph, with the split input copies inserted
|
445
|
+
struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
|
446
|
+
int n_splits; // number of used entries in splits
|
447
|
+
|
448
|
+
struct ggml_context * ctx; // no_alloc context living in context_buffer; re-created on every sched_split_graph call
|
449
|
+
|
450
|
+
// align context_buffer to GGML_MEM_ALIGN
|
451
|
+
#ifdef _MSC_VER
|
452
|
+
__declspec(align(GGML_MEM_ALIGN))
|
453
|
+
#else
|
454
|
+
__attribute__((aligned(GGML_MEM_ALIGN)))
|
455
|
+
#endif
|
456
|
+
// backing storage handed to ggml_init as mem_buffer for ctx
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + GGML_MAX_SPLITS*sizeof(struct ggml_cgraph)];
|
457
|
+
};
|
458
|
+
|
459
|
+
// Both macros expand to expressions that use a variable named `sched` from the enclosing scope.
// hash_id: slot id of `node` in sched->hash_set. It calls ggml_hash_find_or_insert, so it
// presumably inserts the tensor when it is not present yet — confirm against ggml-impl.h.
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
460
|
+
// node_allocr: the sched->node_talloc entry for `node`; usable as an lvalue (it is assigned to in sched_split_graph).
#define node_allocr(node) sched->node_talloc[hash_id(node)]
|
461
|
+
|
462
|
+
static bool ggml_is_view_op(enum ggml_op op) {
|
463
|
+
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
464
|
+
}
|
465
|
+
|
466
|
+
// returns the priority of the backend, lower is better
|
467
|
+
static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
468
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
469
|
+
if (sched->backends[i] == backend) {
|
470
|
+
return i;
|
471
|
+
}
|
472
|
+
}
|
473
|
+
return INT_MAX;
|
474
|
+
}
|
475
|
+
|
476
|
+
// returns the position of the allocator in sched->tallocs (same ordering as the backends),
// or INT_MAX when the allocator does not belong to this scheduler
static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
    int slot = 0;
    while (slot < sched->n_backends && sched->tallocs[slot] != allocr) {
        slot++;
    }
    return slot < sched->n_backends ? slot : INT_MAX;
}
|
484
|
+
|
485
|
+
// returns the backend that should be used for the node based on the current locations
|
486
|
+
char causes[GGML_DEFAULT_GRAPH_SIZE*4 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
|
487
|
+
static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
488
|
+
// if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
|
489
|
+
// ie. kv cache updates
|
490
|
+
// note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
|
491
|
+
// dst
|
492
|
+
ggml_backend_t cur_backend = ggml_get_backend(node);
|
493
|
+
if (cur_backend != NULL) {
|
494
|
+
sprintf(causes[hash_id(node)], "1.dst");
|
495
|
+
return cur_backend;
|
496
|
+
}
|
497
|
+
|
498
|
+
// view_src
|
499
|
+
if (node->view_src != NULL && ggml_get_backend(node->view_src) != NULL) {
|
500
|
+
sprintf(causes[hash_id(node)], "1.vsrc");
|
501
|
+
return ggml_get_backend(node->view_src);
|
502
|
+
}
|
503
|
+
|
504
|
+
// src
|
505
|
+
int cur_prio = INT_MAX;
|
506
|
+
size_t cur_size = 0;
|
507
|
+
|
508
|
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
509
|
+
const struct ggml_tensor * src = node->src[i];
|
510
|
+
if (src == NULL) {
|
511
|
+
break;
|
512
|
+
}
|
513
|
+
ggml_backend_t src_backend = ggml_get_backend(src);
|
514
|
+
if (src_backend != NULL) {
|
515
|
+
int src_prio = sched_backend_prio(sched, src_backend);
|
516
|
+
size_t src_size = ggml_nbytes(src);
|
517
|
+
if (src_prio < cur_prio && src_size >= cur_size) {
|
518
|
+
cur_prio = src_prio;
|
519
|
+
cur_size = src_size;
|
520
|
+
cur_backend = src_backend;
|
521
|
+
sprintf(causes[hash_id(node)], "1.src%d", i);
|
522
|
+
}
|
523
|
+
}
|
524
|
+
}
|
525
|
+
return cur_backend;
|
526
|
+
}
|
527
|
+
|
528
|
+
// Formats a byte count as whole mebibytes ("<n>M") when size >= 1 MiB, otherwise as whole
// kibibytes ("<n>K"); values are truncated, so anything below 1024 prints as "0K".
// Returns a pointer to a function-local static buffer: the result is overwritten by the next
// call and the function is not reentrant.
static char * fmt_size(size_t size) {
    static char buffer[128];
    if (size >= 1024*1024) {
        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
    } else {
        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
    }
    return buffer;
}
|
537
|
+
|
538
|
+
static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
539
|
+
int cur_split = 0;
|
540
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
541
|
+
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
542
|
+
ggml_backend_t split_backend = ggml_tallocr_get_buffer(sched->splits[cur_split].tallocr)->backend;
|
543
|
+
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs);
|
544
|
+
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
545
|
+
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
546
|
+
}
|
547
|
+
fprintf(stderr, "\n");
|
548
|
+
cur_split++;
|
549
|
+
}
|
550
|
+
struct ggml_tensor * node = graph->nodes[i];
|
551
|
+
if (ggml_is_view_op(node->op)) {
|
552
|
+
continue;
|
553
|
+
}
|
554
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
555
|
+
ggml_backend_t node_backend = node_allocr ? ggml_tallocr_get_buffer(node_allocr)->backend : NULL;
|
556
|
+
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", causes[hash_id(node)]);
|
557
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
558
|
+
struct ggml_tensor * src = node->src[j];
|
559
|
+
if (src == NULL) {
|
560
|
+
break;
|
561
|
+
}
|
562
|
+
ggml_tallocr_t src_allocr = node_allocr(src);
|
563
|
+
ggml_backend_t src_backend = src_allocr ? ggml_tallocr_get_buffer(src_allocr)->backend : NULL;
|
564
|
+
fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", causes[hash_id(src)]);
|
565
|
+
}
|
566
|
+
fprintf(stderr, "\n");
|
567
|
+
}
|
568
|
+
}
|
569
|
+
|
570
|
+
// creates a copy of the tensor with the same memory layout
|
571
|
+
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
|
572
|
+
struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
|
573
|
+
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
574
|
+
dup->nb[i] = tensor->nb[i];
|
575
|
+
}
|
576
|
+
return dup;
|
577
|
+
}
|
578
|
+
|
579
|
+
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
580
|
+
// TODO: merge passes
|
581
|
+
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
582
|
+
// reset state
|
583
|
+
size_t hash_size = sched->hash_set.size;
|
584
|
+
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
585
|
+
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
586
|
+
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
587
|
+
sched->n_splits = 0;
|
588
|
+
|
589
|
+
struct ggml_init_params params = {
|
590
|
+
/*.mem_size = */ sizeof(sched->context_buffer),
|
591
|
+
/*.mem_buffer = */ sched->context_buffer,
|
592
|
+
/*.no_alloc = */ true
|
593
|
+
};
|
594
|
+
|
595
|
+
if (sched->ctx != NULL) {
|
596
|
+
ggml_free(sched->ctx);
|
597
|
+
}
|
598
|
+
|
599
|
+
sched->ctx = ggml_init(params);
|
600
|
+
|
601
|
+
// pass 1: assign backends to ops with allocated inputs
|
602
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
603
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
604
|
+
if (node_allocr(leaf) != NULL) {
|
605
|
+
// do not overwrite user assignments
|
606
|
+
continue;
|
607
|
+
}
|
608
|
+
ggml_backend_t leaf_backend = ggml_get_backend(leaf);
|
609
|
+
if (leaf_backend == NULL && leaf->view_src != NULL) {
|
610
|
+
leaf_backend = ggml_get_backend(leaf->view_src);
|
611
|
+
}
|
612
|
+
if (leaf_backend != NULL) {
|
613
|
+
node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
|
614
|
+
}
|
615
|
+
}
|
616
|
+
|
617
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
618
|
+
struct ggml_tensor * node = graph->nodes[i];
|
619
|
+
if (node_allocr(node) != NULL) {
|
620
|
+
// do not overwrite user assignments
|
621
|
+
continue;
|
622
|
+
}
|
623
|
+
ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
|
624
|
+
if (node_backend != NULL) {
|
625
|
+
node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
|
626
|
+
}
|
627
|
+
}
|
628
|
+
//printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
629
|
+
|
630
|
+
// pass 2: assign backends to ops from current assignments
|
631
|
+
// TODO:
|
632
|
+
// - reuse sched_backend_from_cur
|
633
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
634
|
+
struct ggml_tensor * node = graph->nodes[i];
|
635
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
636
|
+
if (node_allocr == NULL) {
|
637
|
+
int cur_prio = INT_MAX;
|
638
|
+
size_t cur_size = 0;
|
639
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
640
|
+
struct ggml_tensor * src = node->src[j];
|
641
|
+
if (src == NULL) {
|
642
|
+
break;
|
643
|
+
}
|
644
|
+
ggml_tallocr_t src_allocr = node_allocr(src);
|
645
|
+
if (src_allocr != NULL) {
|
646
|
+
int src_prio = sched_allocr_prio(sched, src_allocr);
|
647
|
+
size_t src_size = ggml_nbytes(src);
|
648
|
+
if (src_prio < cur_prio && src_size >= cur_size) {
|
649
|
+
cur_prio = src_prio;
|
650
|
+
cur_size = src_size;
|
651
|
+
node_allocr = src_allocr;
|
652
|
+
sprintf(causes[hash_id(node)], "2.src%d", j);
|
653
|
+
}
|
654
|
+
}
|
655
|
+
}
|
656
|
+
if (node_allocr != NULL) {
|
657
|
+
node_allocr(node) = node_allocr;
|
658
|
+
}
|
659
|
+
}
|
660
|
+
}
|
661
|
+
//printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
662
|
+
|
663
|
+
// pass 3: assign backends to remaining src from dst (should only be leafs)
|
664
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
665
|
+
struct ggml_tensor * node = graph->nodes[i];
|
666
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
667
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
668
|
+
struct ggml_tensor * src = node->src[j];
|
669
|
+
if (src == NULL) {
|
670
|
+
break;
|
671
|
+
}
|
672
|
+
ggml_tallocr_t src_allocr = node_allocr(src);
|
673
|
+
if (src_allocr == NULL) {
|
674
|
+
node_allocr(src) = node_allocr;
|
675
|
+
}
|
676
|
+
}
|
677
|
+
}
|
678
|
+
//printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
679
|
+
|
680
|
+
// pass 4: split graph, find tensors that need to be copied
|
681
|
+
// TODO:
|
682
|
+
// - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
|
683
|
+
// find first backend
|
684
|
+
int cur_split = 0;
|
685
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
686
|
+
struct ggml_tensor * node = graph->nodes[i];
|
687
|
+
if (node->view_src == NULL) {
|
688
|
+
sched->splits[0].tallocr = node_allocr(node);
|
689
|
+
break;
|
690
|
+
}
|
691
|
+
}
|
692
|
+
sched->splits[0].i_start = 0;
|
693
|
+
sched->splits[0].n_inputs = 0;
|
694
|
+
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
695
|
+
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
696
|
+
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
697
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
698
|
+
struct ggml_tensor * node = graph->nodes[i];
|
699
|
+
|
700
|
+
if (ggml_is_view_op(node->op)) {
|
701
|
+
continue;
|
702
|
+
}
|
703
|
+
|
704
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
705
|
+
|
706
|
+
if (node_allocr != cur_allocr) {
|
707
|
+
sched->splits[cur_split].i_end = i;
|
708
|
+
cur_split++;
|
709
|
+
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
710
|
+
sched->splits[cur_split].tallocr = node_allocr;
|
711
|
+
sched->splits[cur_split].i_start = i;
|
712
|
+
sched->splits[cur_split].n_inputs = 0;
|
713
|
+
memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
|
714
|
+
cur_allocr = node_allocr;
|
715
|
+
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
716
|
+
}
|
717
|
+
|
718
|
+
// find inputs that are not on the same backend
|
719
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
720
|
+
struct ggml_tensor * src = node->src[j];
|
721
|
+
if (src == NULL) {
|
722
|
+
break;
|
723
|
+
}
|
724
|
+
ggml_tallocr_t src_allocr = node_allocr(src);
|
725
|
+
if (src_allocr != node_allocr) {
|
726
|
+
int n_inputs = sched->splits[cur_split].n_inputs++;
|
727
|
+
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
728
|
+
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
|
729
|
+
|
730
|
+
// create copies
|
731
|
+
size_t id = hash_id(src);
|
732
|
+
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
733
|
+
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
734
|
+
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
735
|
+
node_allocr(tensor_copy) = cur_allocr;
|
736
|
+
ggml_backend_t backend = ggml_tallocr_get_buffer(cur_allocr)->backend;
|
737
|
+
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
738
|
+
}
|
739
|
+
node->src[j] = sched->node_copies[id][cur_backend_id];
|
740
|
+
}
|
741
|
+
}
|
742
|
+
}
|
743
|
+
sched->splits[cur_split].i_end = graph->n_nodes;
|
744
|
+
sched->n_splits = cur_split + 1;
|
745
|
+
|
746
|
+
//fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
|
747
|
+
|
748
|
+
#if 1
|
749
|
+
// sanity check: all sources should have the same backend as the node
|
750
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
751
|
+
struct ggml_tensor * node = graph->nodes[i];
|
752
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
753
|
+
if (node_allocr == NULL) {
|
754
|
+
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
755
|
+
}
|
756
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
757
|
+
struct ggml_tensor * src = node->src[j];
|
758
|
+
if (src == NULL) {
|
759
|
+
break;
|
760
|
+
}
|
761
|
+
ggml_tallocr_t src_allocr = node_allocr(src);
|
762
|
+
if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
|
763
|
+
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
764
|
+
node->name, node_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(node_allocr)->backend) : "NULL",
|
765
|
+
j, src->name, src_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(src_allocr)->backend) : "NULL");
|
766
|
+
}
|
767
|
+
}
|
768
|
+
}
|
769
|
+
#endif
|
770
|
+
|
771
|
+
// create copies of the graph for each split
|
772
|
+
// FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
|
773
|
+
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
|
774
|
+
for (int i = 0; i < sched->n_splits; i++) {
|
775
|
+
struct ggml_backend_sched_split * split = &sched->splits[i];
|
776
|
+
split->graph = ggml_graph_view(sched->ctx, graph, split->i_start, split->i_end);
|
777
|
+
|
778
|
+
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
779
|
+
for (int j = 0; j < split->n_inputs; j++) {
|
780
|
+
struct ggml_tensor * input = split->inputs[j];
|
781
|
+
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
|
782
|
+
input_cpy->src[0] = input;
|
783
|
+
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
784
|
+
}
|
785
|
+
|
786
|
+
for (int j = split->i_start; j < split->i_end; j++) {
|
787
|
+
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
788
|
+
}
|
789
|
+
}
|
790
|
+
sched->graph = graph_copy;
|
791
|
+
}
|
792
|
+
|
793
|
+
// hands the scheduled graph copy to the graph allocator together with the per-tensor allocator table
static void sched_alloc_splits(ggml_backend_sched_t sched) {
    ggml_gallocr_alloc_graph_n(sched->galloc, sched->graph, sched->hash_set, sched->node_talloc);
}
|
800
|
+
|
801
|
+
// runs the splits in order: for each split, copies its input tensors to the split backend and
// then computes the split graph on that backend
// exits the process when an input has neither a buffer nor a view_src, or when an input copy has no buffer
static void sched_compute_splits(ggml_backend_sched_t sched) {
    // accumulated per-backend timings, only reported by the disabled block at the end
    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};

    struct ggml_backend_sched_split * splits = sched->splits;

    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &splits[i];
        ggml_backend_t split_backend = ggml_tallocr_get_buffer(split->tallocr)->backend;
        int split_backend_id = sched_backend_prio(sched, split_backend);
        // the id indexes copy_us/compute_us and node_copies; INT_MAX (unknown backend) must not get through
        GGML_ASSERT(split_backend_id >= 0 && split_backend_id < sched->n_backends);

        // copy the input tensors to the split backend
        uint64_t copy_start_us = ggml_time_us();
        for (int j = 0; j < split->n_inputs; j++) {
            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(split->inputs[j])][split_backend_id];
            if (split->inputs[j]->buffer == NULL) {
                if (split->inputs[j]->view_src == NULL) {
                    fprintf(stderr, "input %s has no buffer and no view_src\n", split->inputs[j]->name);
                    exit(1);
                }
                // an unallocated view takes its backend, buffer and data pointer from the tensor it views
                struct ggml_tensor * view = split->inputs[j];
                view->backend = view->view_src->backend;
                view->buffer = view->view_src->buffer;
                view->data = (char *)view->view_src->data + view->view_offs;
                ggml_backend_buffer_init_tensor(ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
            }
            if (input_cpy->buffer == NULL) {
                fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
                exit(1);
            }
            GGML_ASSERT(split->inputs[j]->buffer->backend != input_cpy->buffer->backend);
            GGML_ASSERT(input_cpy->buffer->backend == split_backend);
            ggml_backend_tensor_copy(split->inputs[j], input_cpy);
        }
        // ggml_backend_synchronize(split_backend);
        uint64_t copy_end_us = ggml_time_us();
        copy_us[split_backend_id] += copy_end_us - copy_start_us;

#if 0
        char split_filename[GGML_MAX_NAME];
        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
        ggml_graph_dump_dot(split->graph, NULL, split_filename);
#endif

        uint64_t compute_start_us = ggml_time_us();
        ggml_backend_graph_compute(split_backend, split->graph);
        // ggml_backend_synchronize(split_backend);
        uint64_t compute_end_us = ggml_time_us();
        compute_us[split_backend_id] += compute_end_us - compute_start_us;
    }

#if 0
    // per-backend timings
    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
    for (int i = 0; i < sched->n_backends; i++) {
        if (copy_us[i] > 0 || compute_us[i] > 0) {
            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
        }
    }
#endif
}
|
862
|
+
|
863
|
+
// resets the tensor allocator of every registered backend
static void sched_reset(ggml_backend_sched_t sched) {
    int slot = 0;
    while (slot < sched->n_backends) {
        ggml_tallocr_reset(sched->tallocs[slot]);
        slot++;
    }
}
|
868
|
+
|
869
|
+
// Creates a scheduler over `n_backends` backends. The order of `backends` is kept and is what
// sched_backend_prio reports as priority (lower index is better). A measure allocator is created
// for each backend. Aborts via GGML_ASSERT when there are too many backends or the allocation fails.
// Release the result with ggml_backend_sched_free.
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);

    // zero-initialized: the rest of the scheduler relies on NULL tables and a NULL ctx at start
    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
    GGML_ASSERT(sched != NULL && "failed to allocate scheduler");

    // sizeof yields size_t, which needs %zu
    fprintf(stderr, "ggml_backend_sched size: %zu KB\n", sizeof(struct ggml_backend_sched)/1024);

    sched->n_backends = n_backends;
    for (int i = 0; i < n_backends; i++) {
        sched->backends[i] = backends[i];
    }

    sched->galloc = ggml_gallocr_new();

    // init measure allocs for each backend
    for (int i = 0; i < n_backends; i++) {
        sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
    }

    return sched;
}
|
891
|
+
|
892
|
+
// Releases a scheduler and everything it owns: the per-backend allocators, the graph allocator,
// the ggml context created by sched_split_graph and the hash tables. NULL is accepted and ignored.
// The backends themselves are not freed here.
void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    if (sched == NULL) {
        return;
    }
    for (int i = 0; i < sched->n_backends; i++) {
        ggml_tallocr_free(sched->tallocs[i]);
    }
    ggml_gallocr_free(sched->galloc);
    // sched_split_graph leaves its most recent context alive; release it so it is not leaked
    if (sched->ctx != NULL) {
        ggml_free(sched->ctx);
    }
    free(sched->hash_set.keys);
    free(sched->node_talloc);
    free(sched->node_copies);
    free(sched);
}
|
905
|
+
|
906
|
+
// Sizes the scheduler for `measure_graph`: allocates the hash tables, runs the split and allocation
// passes with the measure allocators, then replaces each measure allocator with a real one of the
// measured maximum size. Aborts via GGML_ASSERT when a table cannot be allocated.
void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    // initialize hash tables
    // extra room is reserved for the tensor copies that sched_split_graph inserts
    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;

    // drop tables from an earlier call so they are not leaked (all NULL on a fresh scheduler)
    free(sched->hash_set.keys);
    free(sched->node_talloc);
    free(sched->node_copies);

    sched->hash_set.size = hash_size;
    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
    sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
    sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
    GGML_ASSERT(sched->hash_set.keys != NULL && "failed to allocate scheduler hash keys");
    GGML_ASSERT(sched->node_talloc != NULL && "failed to allocate scheduler allocator table");
    GGML_ASSERT(sched->node_copies != NULL && "failed to allocate scheduler copy table");

    // the tables are zeroed at the start of sched_split_graph
    sched_split_graph(sched, measure_graph);
    sched_alloc_splits(sched);

    // allocate buffers and reset allocators
    for (int i = 0; i < sched->n_backends; i++) {
        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
        ggml_tallocr_free(sched->tallocs[i]);
        sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
    }

    sched_reset(sched);
}
|
926
|
+
|
927
|
+
// Splits, allocates and computes `graph` across the scheduler's backends, then resets the allocators.
// The hash tables must be at least as large as this graph requires (asserted below).
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

    sched_split_graph(sched, graph);    // assign backends and build the splits
    sched_alloc_splits(sched);          // allocate the tensors of the scheduled graph
    sched_compute_splits(sched);        // copy inputs and run each split
    sched_reset(sched);                 // reset the per-backend allocators
}
|
935
|
+
|
936
|
+
// Returns the tensor allocator the scheduler holds for `backend`.
// Aborts via GGML_ASSERT when the backend is not registered with this scheduler.
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = sched_backend_prio(sched, backend);
    // sched_backend_prio returns INT_MAX for an unknown backend, which must not be used as an index
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    return sched->tallocs[backend_index];
}
|
940
|
+
|
941
|
+
// Returns the buffer of the tensor allocator the scheduler holds for `backend`.
// Aborts via GGML_ASSERT when the backend is not registered with this scheduler.
ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = sched_backend_prio(sched, backend);
    // sched_backend_prio returns INT_MAX for an unknown backend, which must not be used as an index
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
}
|
945
|
+
|
946
|
+
// Assigns `node` to the allocator of `backend` in the scheduler's per-tensor table.
// Aborts via GGML_ASSERT when the backend is not registered with this scheduler.
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    const int backend_index = sched_backend_prio(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

    ggml_tallocr_t chosen = sched->tallocs[backend_index];
    node_allocr(node) = chosen;
}
|