cui-llama.rn 1.2.4 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +0 -3
- package/android/src/main/jni.cpp +2 -4
- package/cpp/common.cpp +6 -14
- package/cpp/common.h +59 -40
- package/cpp/ggml-aarch64.c +269 -0
- package/cpp/ggml-backend-impl.h +4 -15
- package/cpp/ggml-backend.cpp +1640 -1604
- package/cpp/ggml-backend.h +13 -25
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu.c +13720 -0
- package/cpp/ggml-cpu.h +150 -0
- package/cpp/ggml-impl.h +87 -0
- package/cpp/ggml-metal.m +185 -71
- package/cpp/ggml-quants.c +38 -51
- package/cpp/ggml.c +4442 -19516
- package/cpp/ggml.h +25 -146
- package/cpp/llama-sampling.cpp +392 -241
- package/cpp/llama-sampling.h +18 -0
- package/cpp/llama-vocab.cpp +16 -0
- package/cpp/llama-vocab.h +5 -0
- package/cpp/llama.cpp +2084 -2007
- package/cpp/llama.h +13 -11
- package/cpp/sampling.cpp +19 -11
- package/cpp/sgemm.cpp +57 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +0 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +0 -1
package/cpp/ggml-backend.cpp
CHANGED
@@ -8,6 +8,7 @@
|
|
8
8
|
#include <windows.h>
|
9
9
|
#endif
|
10
10
|
|
11
|
+
#include "ggml-backend.h"
|
11
12
|
#include "ggml-backend-impl.h"
|
12
13
|
#include "ggml-alloc.h"
|
13
14
|
#include "ggml-impl.h"
|
@@ -34,6 +35,11 @@ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
|
|
34
35
|
}
|
35
36
|
|
36
37
|
lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
|
38
|
+
if (size == 0) {
|
39
|
+
// return a dummy buffer for zero-sized allocations
|
40
|
+
return lm_ggml_backend_buffer_init(buft, {}, NULL, 0);
|
41
|
+
}
|
42
|
+
|
37
43
|
return buft->iface.alloc_buffer(buft, size);
|
38
44
|
}
|
39
45
|
|
@@ -89,7 +95,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
|
|
89
95
|
}
|
90
96
|
|
91
97
|
const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
|
92
|
-
return
|
98
|
+
return lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(buffer));
|
93
99
|
}
|
94
100
|
|
95
101
|
void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
|
@@ -108,6 +114,11 @@ size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
|
|
108
114
|
}
|
109
115
|
|
110
116
|
void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
|
117
|
+
// get_base is optional if the buffer is zero-sized
|
118
|
+
if (buffer->size == 0) {
|
119
|
+
return NULL;
|
120
|
+
}
|
121
|
+
|
111
122
|
void * base = buffer->iface.get_base(buffer);
|
112
123
|
|
113
124
|
LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
@@ -122,6 +133,15 @@ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct
|
|
122
133
|
}
|
123
134
|
}
|
124
135
|
|
136
|
+
void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
|
137
|
+
// clear is optional if the buffer is zero-sized
|
138
|
+
if (buffer->size == 0) {
|
139
|
+
return;
|
140
|
+
}
|
141
|
+
|
142
|
+
buffer->iface.clear(buffer, value);
|
143
|
+
}
|
144
|
+
|
125
145
|
size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
|
126
146
|
return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
|
127
147
|
}
|
@@ -134,10 +154,6 @@ size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, st
|
|
134
154
|
return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
|
135
155
|
}
|
136
156
|
|
137
|
-
void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
|
138
|
-
buffer->iface.clear(buffer, value);
|
139
|
-
}
|
140
|
-
|
141
157
|
bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
|
142
158
|
return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
|
143
159
|
}
|
@@ -198,7 +214,7 @@ void lm_ggml_backend_free(lm_ggml_backend_t backend) {
|
|
198
214
|
}
|
199
215
|
|
200
216
|
lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
|
201
|
-
return backend->
|
217
|
+
return lm_ggml_backend_dev_buffer_type(backend->device);
|
202
218
|
}
|
203
219
|
|
204
220
|
lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
|
@@ -238,43 +254,42 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
|
|
238
254
|
void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
239
255
|
lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
240
256
|
|
257
|
+
if (size == 0) {
|
258
|
+
return;
|
259
|
+
}
|
260
|
+
|
241
261
|
LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
242
262
|
LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
243
263
|
LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
|
244
264
|
|
245
|
-
if (!size) {
|
246
|
-
return;
|
247
|
-
}
|
248
|
-
|
249
265
|
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
250
266
|
}
|
251
267
|
|
252
268
|
void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
253
269
|
lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
254
270
|
|
271
|
+
if (size == 0) {
|
272
|
+
return;
|
273
|
+
}
|
274
|
+
|
255
275
|
LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
256
276
|
LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
257
277
|
LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
|
258
278
|
|
259
|
-
if (!size) {
|
260
|
-
return;
|
261
|
-
}
|
262
|
-
|
263
279
|
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
264
280
|
}
|
265
281
|
|
266
282
|
LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
267
283
|
lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
268
284
|
|
269
|
-
|
270
|
-
LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
271
|
-
LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
|
272
|
-
|
273
|
-
if (!size) {
|
285
|
+
if (size == 0) {
|
274
286
|
return;
|
275
287
|
}
|
276
288
|
|
277
|
-
LM_GGML_ASSERT(buf
|
289
|
+
LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
290
|
+
LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
291
|
+
LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
|
292
|
+
LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
|
278
293
|
|
279
294
|
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
280
295
|
}
|
@@ -316,32 +331,15 @@ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backen
|
|
316
331
|
}
|
317
332
|
|
318
333
|
bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
|
319
|
-
|
320
|
-
if (backend->device) {
|
321
|
-
return lm_ggml_backend_dev_supports_op(backend->device, op);
|
322
|
-
}
|
323
|
-
|
324
|
-
return backend->iface.supports_op(backend, op);
|
334
|
+
return lm_ggml_backend_dev_supports_op(backend->device, op);
|
325
335
|
}
|
326
336
|
|
327
337
|
bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
|
328
|
-
|
329
|
-
if (backend->device) {
|
330
|
-
return lm_ggml_backend_dev_supports_buft(backend->device, buft);
|
331
|
-
}
|
332
|
-
return backend->iface.supports_buft(backend, buft);
|
338
|
+
return lm_ggml_backend_dev_supports_buft(backend->device, buft);
|
333
339
|
}
|
334
340
|
|
335
341
|
bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
|
336
|
-
|
337
|
-
if (backend->device) {
|
338
|
-
return lm_ggml_backend_dev_offload_op(backend->device, op);
|
339
|
-
}
|
340
|
-
|
341
|
-
if (backend->iface.offload_op != NULL) {
|
342
|
-
return backend->iface.offload_op(backend, op);
|
343
|
-
}
|
344
|
-
return false;
|
342
|
+
return lm_ggml_backend_dev_offload_op(backend->device, op);
|
345
343
|
}
|
346
344
|
|
347
345
|
lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
|
@@ -561,6 +559,16 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
|
|
561
559
|
# include "ggml-amx.h"
|
562
560
|
#endif
|
563
561
|
|
562
|
+
#ifdef LM_GGML_USE_CANN
|
563
|
+
#include "ggml-cann.h"
|
564
|
+
#endif
|
565
|
+
|
566
|
+
#ifdef LM_GGML_USE_KOMPUTE
|
567
|
+
#include "ggml-kompute.h"
|
568
|
+
#endif
|
569
|
+
|
570
|
+
#include "ggml-cpu.h"
|
571
|
+
|
564
572
|
struct lm_ggml_backend_registry {
|
565
573
|
std::vector<lm_ggml_backend_reg_t> backends;
|
566
574
|
std::vector<lm_ggml_backend_dev_t> devices;
|
@@ -578,6 +586,9 @@ struct lm_ggml_backend_registry {
|
|
578
586
|
#ifdef LM_GGML_USE_VULKAN
|
579
587
|
register_backend(lm_ggml_backend_vk_reg());
|
580
588
|
#endif
|
589
|
+
#ifdef LM_GGML_USE_CANN
|
590
|
+
register_backend(lm_ggml_backend_cann_reg());
|
591
|
+
#endif
|
581
592
|
#ifdef LM_GGML_USE_BLAS
|
582
593
|
register_backend(lm_ggml_backend_blas_reg());
|
583
594
|
#endif
|
@@ -587,8 +598,9 @@ struct lm_ggml_backend_registry {
|
|
587
598
|
#ifdef LM_GGML_USE_AMX
|
588
599
|
register_backend(lm_ggml_backend_amx_reg());
|
589
600
|
#endif
|
590
|
-
|
591
|
-
|
601
|
+
#ifdef LM_GGML_USE_KOMPUTE
|
602
|
+
register_backend(lm_ggml_backend_kompute_reg());
|
603
|
+
#endif
|
592
604
|
|
593
605
|
register_backend(lm_ggml_backend_cpu_reg());
|
594
606
|
}
|
@@ -694,9 +706,9 @@ lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type typ
|
|
694
706
|
}
|
695
707
|
|
696
708
|
lm_ggml_backend_t lm_ggml_backend_init_best(void) {
|
697
|
-
lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(
|
709
|
+
lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU);
|
698
710
|
if (!dev) {
|
699
|
-
dev = lm_ggml_backend_dev_by_type(
|
711
|
+
dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
|
700
712
|
}
|
701
713
|
if (!dev) {
|
702
714
|
return NULL;
|
@@ -704,1922 +716,1946 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
|
|
704
716
|
return lm_ggml_backend_dev_init(dev, NULL);
|
705
717
|
}
|
706
718
|
|
707
|
-
//
|
708
|
-
|
709
|
-
static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
|
710
|
-
return "CPU";
|
711
|
-
|
712
|
-
LM_GGML_UNUSED(buffer);
|
713
|
-
}
|
719
|
+
// multi-buffer buffer
|
714
720
|
|
715
|
-
|
716
|
-
|
721
|
+
struct lm_ggml_backend_multi_buffer_context {
|
722
|
+
lm_ggml_backend_buffer_t * buffers;
|
723
|
+
size_t n_buffers;
|
724
|
+
};
|
717
725
|
|
718
|
-
|
719
|
-
|
720
|
-
|
726
|
+
static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
|
727
|
+
lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
|
728
|
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
729
|
+
lm_ggml_backend_buffer_free(ctx->buffers[i]);
|
721
730
|
}
|
722
731
|
|
723
|
-
|
732
|
+
free(ctx->buffers);
|
733
|
+
free(ctx);
|
724
734
|
}
|
725
735
|
|
726
|
-
static void
|
727
|
-
|
736
|
+
static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
|
737
|
+
lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
|
738
|
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
739
|
+
lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
|
740
|
+
}
|
728
741
|
}
|
729
742
|
|
730
|
-
static
|
731
|
-
|
743
|
+
static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
|
744
|
+
/* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
|
745
|
+
/* .get_base = */ NULL,
|
746
|
+
/* .init_tensor = */ NULL,
|
747
|
+
/* .memset_tensor = */ NULL,
|
748
|
+
/* .set_tensor = */ NULL,
|
749
|
+
/* .get_tensor = */ NULL,
|
750
|
+
/* .cpy_tensor = */ NULL,
|
751
|
+
/* .clear = */ lm_ggml_backend_multi_buffer_clear,
|
752
|
+
/* .reset = */ NULL,
|
753
|
+
};
|
732
754
|
|
733
|
-
|
734
|
-
|
755
|
+
lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
|
756
|
+
lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
|
757
|
+
ctx->n_buffers = n_buffers;
|
758
|
+
ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
|
735
759
|
|
736
|
-
|
737
|
-
memcpy((char *)tensor->data + offset, data, size);
|
760
|
+
LM_GGML_ASSERT(ctx->buffers != NULL);
|
738
761
|
|
739
|
-
|
740
|
-
|
762
|
+
size_t total_size = 0;
|
763
|
+
for (size_t i = 0; i < n_buffers; i++) {
|
764
|
+
ctx->buffers[i] = buffers[i];
|
765
|
+
total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
|
766
|
+
}
|
741
767
|
|
742
|
-
|
743
|
-
|
768
|
+
return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
|
769
|
+
}
|
744
770
|
|
745
|
-
|
771
|
+
bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
|
772
|
+
return buffer->iface.free_buffer == lm_ggml_backend_multi_buffer_free_buffer;
|
746
773
|
}
|
747
774
|
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
775
|
+
void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
|
776
|
+
LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
|
777
|
+
lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
|
778
|
+
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
779
|
+
lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
|
752
780
|
}
|
753
|
-
|
781
|
+
}
|
754
782
|
|
755
|
-
|
783
|
+
// creates a copy of the tensor with the same memory layout
|
784
|
+
static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
|
785
|
+
struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
|
786
|
+
for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
|
787
|
+
dup->nb[i] = tensor->nb[i];
|
788
|
+
}
|
789
|
+
return dup;
|
756
790
|
}
|
757
791
|
|
758
|
-
static
|
759
|
-
|
792
|
+
static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
|
793
|
+
return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
|
760
794
|
}
|
761
795
|
|
762
|
-
|
763
|
-
/* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
|
764
|
-
/* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
|
765
|
-
/* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
|
766
|
-
/* .init_tensor = */ NULL, // no initialization required
|
767
|
-
/* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
|
768
|
-
/* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
|
769
|
-
/* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
|
770
|
-
/* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
|
771
|
-
/* .clear = */ lm_ggml_backend_cpu_buffer_clear,
|
772
|
-
/* .reset = */ NULL,
|
773
|
-
};
|
796
|
+
// scheduler
|
774
797
|
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
/* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
|
779
|
-
/* .init_tensor = */ NULL, // no initialization required
|
780
|
-
/* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
|
781
|
-
/* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
|
782
|
-
/* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
|
783
|
-
/* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
|
784
|
-
/* .clear = */ lm_ggml_backend_cpu_buffer_clear,
|
785
|
-
/* .reset = */ NULL,
|
786
|
-
};
|
798
|
+
#ifndef LM_GGML_SCHED_MAX_BACKENDS
|
799
|
+
#define LM_GGML_SCHED_MAX_BACKENDS 16
|
800
|
+
#endif
|
787
801
|
|
788
|
-
|
789
|
-
|
802
|
+
#ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
|
803
|
+
#define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
|
804
|
+
#endif
|
790
805
|
|
791
|
-
|
792
|
-
|
806
|
+
#ifndef LM_GGML_SCHED_MAX_COPIES
|
807
|
+
#define LM_GGML_SCHED_MAX_COPIES 4
|
808
|
+
#endif
|
793
809
|
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
810
|
+
struct lm_ggml_backend_sched_split {
|
811
|
+
int backend_id;
|
812
|
+
int i_start;
|
813
|
+
int i_end;
|
814
|
+
struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
|
815
|
+
int n_inputs;
|
816
|
+
// graph view of this split
|
817
|
+
struct lm_ggml_cgraph graph;
|
818
|
+
};
|
799
819
|
|
800
|
-
|
820
|
+
struct lm_ggml_backend_sched {
|
821
|
+
bool is_reset; // true if the scheduler has been reset since the last graph split
|
822
|
+
bool is_alloc;
|
801
823
|
|
802
|
-
|
803
|
-
LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
|
804
|
-
return NULL;
|
805
|
-
}
|
824
|
+
int n_backends;
|
806
825
|
|
807
|
-
|
808
|
-
|
826
|
+
lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
|
827
|
+
lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
|
828
|
+
lm_ggml_gallocr_t galloc;
|
809
829
|
|
810
|
-
|
811
|
-
|
830
|
+
// hash map of the nodes in the graph
|
831
|
+
struct lm_ggml_hash_set hash_set;
|
832
|
+
int * hv_tensor_backend_ids; // [hash_set.size]
|
833
|
+
struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
|
812
834
|
|
813
|
-
|
814
|
-
|
835
|
+
int * node_backend_ids; // [graph_size]
|
836
|
+
int * leaf_backend_ids; // [graph_size]
|
815
837
|
|
816
|
-
|
817
|
-
|
838
|
+
int * prev_node_backend_ids; // [graph_size]
|
839
|
+
int * prev_leaf_backend_ids; // [graph_size]
|
818
840
|
|
819
|
-
|
820
|
-
|
841
|
+
// copy of the graph with modified inputs
|
842
|
+
struct lm_ggml_cgraph graph;
|
821
843
|
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
/* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
|
827
|
-
/* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
|
828
|
-
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
829
|
-
/* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
|
830
|
-
/* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
|
831
|
-
},
|
832
|
-
/* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
|
833
|
-
/* .context = */ NULL,
|
834
|
-
};
|
844
|
+
// graph splits
|
845
|
+
struct lm_ggml_backend_sched_split * splits;
|
846
|
+
int n_splits;
|
847
|
+
int splits_capacity;
|
835
848
|
|
836
|
-
|
837
|
-
|
849
|
+
// pipeline parallelism support
|
850
|
+
int n_copies;
|
851
|
+
int cur_copy;
|
852
|
+
lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
|
853
|
+
struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
|
854
|
+
int n_graph_inputs;
|
838
855
|
|
839
|
-
|
856
|
+
struct lm_ggml_context * ctx;
|
840
857
|
|
841
|
-
|
858
|
+
lm_ggml_backend_sched_eval_callback callback_eval;
|
859
|
+
void * callback_eval_user_data;
|
842
860
|
|
843
|
-
|
861
|
+
char * context_buffer;
|
862
|
+
size_t context_buffer_size;
|
844
863
|
|
845
|
-
|
846
|
-
|
864
|
+
int debug;
|
865
|
+
};
|
847
866
|
|
848
|
-
|
849
|
-
|
867
|
+
#define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
868
|
+
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
|
869
|
+
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
|
870
|
+
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
|
850
871
|
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
872
|
+
// returns the priority of the backend, lower id is higher priority
|
873
|
+
static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
|
874
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
875
|
+
if (sched->backends[i] == backend) {
|
876
|
+
return i;
|
877
|
+
}
|
878
|
+
}
|
879
|
+
return -1;
|
855
880
|
}
|
856
881
|
|
857
|
-
static
|
858
|
-
|
859
|
-
|
882
|
+
static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
|
883
|
+
lm_ggml_backend_buffer_t buffer = tensor->buffer;
|
884
|
+
if (buffer == NULL) {
|
885
|
+
return -1;
|
886
|
+
}
|
860
887
|
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
return NULL;
|
888
|
+
// find highest prio backend that supports the buffer type and the op
|
889
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
890
|
+
if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
|
891
|
+
lm_ggml_backend_supports_op(sched->backends[i], op)) {
|
892
|
+
return i;
|
893
|
+
}
|
868
894
|
}
|
869
895
|
|
870
|
-
|
871
|
-
buffer
|
872
|
-
|
873
|
-
|
896
|
+
#ifndef NDEBUG
|
897
|
+
LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
898
|
+
__func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
|
899
|
+
#endif
|
874
900
|
|
875
|
-
return
|
901
|
+
return -1;
|
876
902
|
}
|
877
903
|
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
/* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
|
887
|
-
},
|
888
|
-
/* .context = */ NULL,
|
889
|
-
};
|
890
|
-
|
891
|
-
return &lm_ggml_backend_cpu_buffer_type_hbm;
|
892
|
-
}
|
904
|
+
#if 0
|
905
|
+
#define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
906
|
+
static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
907
|
+
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
908
|
+
#define GET_CAUSE(node) causes[hash_id(node)]
|
909
|
+
#else
|
910
|
+
#define SET_CAUSE(node, ...)
|
911
|
+
#define GET_CAUSE(node) ""
|
893
912
|
#endif
|
894
913
|
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
uint8_t * work_data;
|
900
|
-
size_t work_size;
|
914
|
+
// returns the backend that should be used for the node based on the current locations
|
915
|
+
static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
|
916
|
+
// TODO: use supports_op to check if the backend supports the op
|
901
917
|
|
902
|
-
|
903
|
-
|
904
|
-
|
918
|
+
// assign pre-allocated nodes to their backend
|
919
|
+
int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
920
|
+
if (cur_backend_id != -1) {
|
921
|
+
SET_CAUSE(tensor, "1.dst");
|
922
|
+
return cur_backend_id;
|
923
|
+
}
|
905
924
|
|
906
|
-
|
907
|
-
|
925
|
+
// view_src
|
926
|
+
if (tensor->view_src != NULL) {
|
927
|
+
cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
|
928
|
+
if (cur_backend_id != -1) {
|
929
|
+
SET_CAUSE(tensor, "1.vsrc");
|
930
|
+
return cur_backend_id;
|
931
|
+
}
|
932
|
+
}
|
908
933
|
|
909
|
-
|
910
|
-
|
934
|
+
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
935
|
+
// since the tensor is pre-allocated, it cannot be moved to another backend
|
936
|
+
LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
937
|
+
}
|
911
938
|
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
|
916
|
-
|
917
|
-
}
|
939
|
+
// graph input
|
940
|
+
if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
|
941
|
+
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
942
|
+
SET_CAUSE(tensor, "1.inp");
|
943
|
+
return cur_backend_id;
|
944
|
+
}
|
918
945
|
|
919
|
-
|
920
|
-
|
946
|
+
// operations with weights are preferably run on the same backend as the weights
|
947
|
+
for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
|
948
|
+
const struct lm_ggml_tensor * src = tensor->src[i];
|
949
|
+
if (src == NULL) {
|
950
|
+
continue;
|
951
|
+
}
|
952
|
+
// skip ROPE since the rope freqs tensor is too small to choose a backend based on it
|
953
|
+
// not an ideal solution
|
954
|
+
if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
955
|
+
int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
956
|
+
// check if a backend with higher prio wants to offload the op
|
957
|
+
if (src_backend_id == sched->n_backends - 1) {
|
958
|
+
for (int b = 0; b < src_backend_id; b++) {
|
959
|
+
if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
|
960
|
+
SET_CAUSE(tensor, "1.off");
|
961
|
+
return b;
|
962
|
+
}
|
963
|
+
}
|
964
|
+
}
|
965
|
+
SET_CAUSE(tensor, "1.wgt%d", i);
|
966
|
+
return src_backend_id;
|
967
|
+
}
|
968
|
+
}
|
921
969
|
|
922
|
-
|
970
|
+
return -1;
|
923
971
|
}
|
924
972
|
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
936
|
-
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
973
|
+
static char * fmt_size(size_t size) {
|
974
|
+
static char buffer[128];
|
975
|
+
if (size >= 1024*1024) {
|
976
|
+
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
|
977
|
+
} else {
|
978
|
+
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
|
979
|
+
}
|
980
|
+
return buffer;
|
981
|
+
}
|
937
982
|
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
983
|
+
static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
984
|
+
int cur_split = 0;
|
985
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
986
|
+
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
987
|
+
lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
988
|
+
LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
|
989
|
+
sched->splits[cur_split].n_inputs);
|
990
|
+
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
991
|
+
LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
992
|
+
fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
993
|
+
}
|
994
|
+
LM_GGML_LOG_DEBUG("\n");
|
995
|
+
cur_split++;
|
996
|
+
}
|
997
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
998
|
+
if (lm_ggml_is_view_op(node->op)) {
|
999
|
+
continue;
|
1000
|
+
}
|
1001
|
+
if (sched->debug > 1) {
|
1002
|
+
lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
|
1003
|
+
LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
|
1004
|
+
fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
1005
|
+
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1006
|
+
struct lm_ggml_tensor * src = node->src[j];
|
1007
|
+
if (src == NULL) {
|
1008
|
+
continue;
|
1009
|
+
}
|
1010
|
+
lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
|
1011
|
+
LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
1012
|
+
fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
1013
|
+
}
|
1014
|
+
LM_GGML_LOG_DEBUG("\n");
|
943
1015
|
}
|
944
1016
|
}
|
945
|
-
|
946
|
-
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
947
|
-
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
948
|
-
|
949
|
-
return cpu_plan;
|
950
1017
|
}
|
951
1018
|
|
952
|
-
static
|
953
|
-
|
1019
|
+
static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
|
1020
|
+
lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
1021
|
+
lm_ggml_backend_buffer_type_t buft = NULL;
|
954
1022
|
|
955
|
-
|
956
|
-
|
1023
|
+
if (buf) {
|
1024
|
+
// the tensor is already allocated
|
1025
|
+
buft = buf->buft;
|
1026
|
+
} else {
|
1027
|
+
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
1028
|
+
int tensor_backend_id = tensor_backend_id(t);
|
1029
|
+
if (tensor_backend_id == -1 && t->view_src) {
|
1030
|
+
tensor_backend_id = tensor_backend_id(t->view_src);
|
1031
|
+
}
|
1032
|
+
if (tensor_backend_id != -1) {
|
1033
|
+
buft = sched->bufts[tensor_backend_id];
|
1034
|
+
}
|
1035
|
+
}
|
957
1036
|
|
958
|
-
|
1037
|
+
return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
|
959
1038
|
}
|
960
1039
|
|
961
|
-
static
|
962
|
-
|
1040
|
+
static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
|
1041
|
+
if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
1042
|
+
*node_backend_id = cur_backend_id;
|
1043
|
+
SET_CAUSE(node, "2.sup");
|
1044
|
+
}
|
1045
|
+
}
|
963
1046
|
|
964
|
-
|
1047
|
+
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
1048
|
+
static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
1049
|
+
// reset splits
|
1050
|
+
sched->n_splits = 0;
|
1051
|
+
sched->n_graph_inputs = 0;
|
1052
|
+
sched->is_reset = false;
|
965
1053
|
|
966
|
-
|
967
|
-
|
1054
|
+
struct lm_ggml_init_params params = {
|
1055
|
+
/* .mem_size = */ sched->context_buffer_size,
|
1056
|
+
/* .mem_buffer = */ sched->context_buffer,
|
1057
|
+
/* .no_alloc = */ true
|
1058
|
+
};
|
968
1059
|
|
969
|
-
|
970
|
-
struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
|
1060
|
+
lm_ggml_free(sched->ctx);
|
971
1061
|
|
972
|
-
|
1062
|
+
sched->ctx = lm_ggml_init(params);
|
1063
|
+
if (sched->ctx == NULL) {
|
1064
|
+
LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
|
1065
|
+
}
|
973
1066
|
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
1067
|
+
// pass 1: assign backends to ops with pre-allocated inputs
|
1068
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
1069
|
+
struct lm_ggml_tensor * leaf = graph->leafs[i];
|
1070
|
+
int * leaf_backend_id = &tensor_backend_id(leaf);
|
1071
|
+
// do not overwrite user assignments
|
1072
|
+
if (*leaf_backend_id == -1) {
|
1073
|
+
*leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
980
1074
|
}
|
981
|
-
cpu_ctx->work_size = cplan.work_size;
|
982
1075
|
}
|
983
|
-
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
|
984
|
-
|
985
|
-
cplan.abort_callback = cpu_ctx->abort_callback;
|
986
|
-
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
987
1076
|
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
/* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
|
995
|
-
/* .set_tensor_async = */ NULL,
|
996
|
-
/* .get_tensor_async = */ NULL,
|
997
|
-
/* .cpy_tensor_async = */ NULL,
|
998
|
-
/* .synchronize = */ NULL,
|
999
|
-
/* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
|
1000
|
-
/* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
|
1001
|
-
/* .graph_plan_update = */ NULL,
|
1002
|
-
/* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
|
1003
|
-
/* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
|
1004
|
-
/* .supports_op = */ NULL,
|
1005
|
-
/* .supports_buft = */ NULL,
|
1006
|
-
/* .offload_op = */ NULL,
|
1007
|
-
/* .event_record = */ NULL,
|
1008
|
-
/* .event_wait = */ NULL,
|
1009
|
-
};
|
1077
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1078
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1079
|
+
int * node_backend_id = &tensor_backend_id(node);
|
1080
|
+
// do not overwrite user assignments
|
1081
|
+
if (*node_backend_id == -1) {
|
1082
|
+
*node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
|
1010
1083
|
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1084
|
+
#if 0
|
1085
|
+
// src
|
1086
|
+
if (node->op == LM_GGML_OP_NONE) {
|
1087
|
+
continue;
|
1088
|
+
}
|
1015
1089
|
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1090
|
+
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1091
|
+
struct lm_ggml_tensor * src = node->src[j];
|
1092
|
+
if (src == NULL) {
|
1093
|
+
continue;
|
1094
|
+
}
|
1095
|
+
int * src_backend_id = &tensor_backend_id(src);
|
1096
|
+
if (*src_backend_id == -1) {
|
1097
|
+
*src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
|
1098
|
+
}
|
1099
|
+
}
|
1100
|
+
#endif
|
1101
|
+
}
|
1020
1102
|
}
|
1021
1103
|
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1104
|
+
// pass 2: expand current backend assignments
|
1105
|
+
// assign the same backend to adjacent nodes
|
1106
|
+
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
1107
|
+
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
1108
|
+
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
|
1109
|
+
// expand gpu down
|
1110
|
+
{
|
1111
|
+
int cur_backend_id = -1;
|
1112
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1113
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1114
|
+
if (lm_ggml_is_view_op(node->op)) {
|
1115
|
+
continue;
|
1116
|
+
}
|
1117
|
+
int * node_backend_id = &tensor_backend_id(node);
|
1118
|
+
if (*node_backend_id != -1) {
|
1119
|
+
if (*node_backend_id == sched->n_backends - 1) {
|
1120
|
+
// skip cpu (lowest prio backend)
|
1121
|
+
cur_backend_id = -1;
|
1122
|
+
} else {
|
1123
|
+
cur_backend_id = *node_backend_id;
|
1124
|
+
}
|
1125
|
+
} else if (cur_backend_id != -1) {
|
1126
|
+
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1127
|
+
}
|
1128
|
+
}
|
1039
1129
|
}
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
}
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1062
|
-
|
1130
|
+
// expand gpu up
|
1131
|
+
{
|
1132
|
+
int cur_backend_id = -1;
|
1133
|
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1134
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1135
|
+
if (lm_ggml_is_view_op(node->op)) {
|
1136
|
+
continue;
|
1137
|
+
}
|
1138
|
+
int * node_backend_id = &tensor_backend_id(node);
|
1139
|
+
if (*node_backend_id != -1) {
|
1140
|
+
if (*node_backend_id == sched->n_backends - 1) {
|
1141
|
+
// skip cpu (lowest prio backend)
|
1142
|
+
cur_backend_id = -1;
|
1143
|
+
} else {
|
1144
|
+
cur_backend_id = *node_backend_id;
|
1145
|
+
}
|
1146
|
+
} else if (cur_backend_id != -1) {
|
1147
|
+
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1148
|
+
}
|
1149
|
+
}
|
1150
|
+
}
|
1151
|
+
// expand rest down
|
1152
|
+
{
|
1153
|
+
int cur_backend_id = -1;
|
1154
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1155
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1156
|
+
if (lm_ggml_is_view_op(node->op)) {
|
1157
|
+
continue;
|
1158
|
+
}
|
1159
|
+
int * node_backend_id = &tensor_backend_id(node);
|
1160
|
+
if (*node_backend_id != -1) {
|
1161
|
+
cur_backend_id = *node_backend_id;
|
1162
|
+
} else if (cur_backend_id != -1) {
|
1163
|
+
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1164
|
+
}
|
1165
|
+
}
|
1166
|
+
}
|
1167
|
+
// expand rest up
|
1168
|
+
{
|
1169
|
+
int cur_backend_id = -1;
|
1170
|
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1171
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1172
|
+
if (lm_ggml_is_view_op(node->op)) {
|
1173
|
+
continue;
|
1174
|
+
}
|
1175
|
+
int * node_backend_id = &tensor_backend_id(node);
|
1176
|
+
if (*node_backend_id != -1) {
|
1177
|
+
cur_backend_id = *node_backend_id;
|
1178
|
+
} else if (cur_backend_id != -1) {
|
1179
|
+
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1180
|
+
}
|
1181
|
+
}
|
1063
1182
|
}
|
1064
|
-
ctx->threadpool = threadpool;
|
1065
|
-
}
|
1066
|
-
|
1067
|
-
void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
|
1068
|
-
LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
|
1069
|
-
|
1070
|
-
struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
|
1071
|
-
ctx->abort_callback = abort_callback;
|
1072
|
-
ctx->abort_callback_data = abort_callback_data;
|
1073
|
-
}
|
1074
|
-
|
1075
|
-
lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
1076
|
-
LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
1077
|
-
return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
1078
|
-
}
|
1079
|
-
|
1080
|
-
////////////////////////
|
1081
|
-
|
1082
|
-
struct lm_ggml_backend_cpu_device_context {
|
1083
|
-
std::string description = "CPU";
|
1084
1183
|
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1184
|
+
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
1185
|
+
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
1186
|
+
// however, we also need to verify that the sources are in compatible buffer types
|
1187
|
+
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
1188
|
+
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
1189
|
+
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
1190
|
+
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
1191
|
+
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
1192
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1193
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1194
|
+
if (lm_ggml_is_view_op(node->op)) {
|
1195
|
+
continue;
|
1091
1196
|
}
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
if (
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1197
|
+
int * node_backend_id = &tensor_backend_id(node);
|
1198
|
+
if (*node_backend_id == -1) {
|
1199
|
+
// unassigned node: find the backend with the most supported inputs
|
1200
|
+
int n_supported_best = -1;
|
1201
|
+
for (int b = 0; b < sched->n_backends; b++) {
|
1202
|
+
if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
|
1203
|
+
int n_supported = 0;
|
1204
|
+
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1205
|
+
struct lm_ggml_tensor * src = node->src[j];
|
1206
|
+
if (src == NULL) {
|
1207
|
+
continue;
|
1103
1208
|
}
|
1104
|
-
|
1105
|
-
|
1209
|
+
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
|
1210
|
+
n_supported++;
|
1106
1211
|
}
|
1107
|
-
|
1212
|
+
}
|
1213
|
+
if (n_supported > n_supported_best) {
|
1214
|
+
n_supported_best = n_supported;
|
1215
|
+
*node_backend_id = b;
|
1216
|
+
SET_CAUSE(node, "3.best");
|
1217
|
+
}
|
1218
|
+
}
|
1219
|
+
}
|
1220
|
+
} else {
|
1221
|
+
// assigned node: upgrade to higher prio backend if possible
|
1222
|
+
for (int b = 0; b < *node_backend_id; b++) {
|
1223
|
+
if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
|
1224
|
+
bool supported = true;
|
1225
|
+
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1226
|
+
struct lm_ggml_tensor * src = node->src[j];
|
1227
|
+
if (src == NULL) {
|
1228
|
+
continue;
|
1229
|
+
}
|
1230
|
+
if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
|
1231
|
+
supported = false;
|
1232
|
+
break;
|
1233
|
+
}
|
1234
|
+
}
|
1235
|
+
if (supported) {
|
1236
|
+
*node_backend_id = b;
|
1237
|
+
SET_CAUSE(node, "3.upg");
|
1108
1238
|
break;
|
1109
1239
|
}
|
1110
1240
|
}
|
1111
1241
|
}
|
1112
|
-
fclose(f);
|
1113
1242
|
}
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1243
|
+
}
|
1244
|
+
|
1245
|
+
// pass 4: assign backends to remaining src from dst and view_src
|
1246
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1247
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1248
|
+
int * cur_backend_id = &tensor_backend_id(node);
|
1249
|
+
if (node->view_src != NULL && *cur_backend_id == -1) {
|
1250
|
+
*cur_backend_id = tensor_backend_id(node->view_src);
|
1251
|
+
SET_CAUSE(node, "4.vsrc");
|
1252
|
+
}
|
1253
|
+
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1254
|
+
struct lm_ggml_tensor * src = node->src[j];
|
1255
|
+
if (src == NULL) {
|
1256
|
+
continue;
|
1257
|
+
}
|
1258
|
+
int * src_backend_id = &tensor_backend_id(src);
|
1259
|
+
if (*src_backend_id == -1) {
|
1260
|
+
if (src->view_src != NULL) {
|
1261
|
+
// views are always on the same backend as the source
|
1262
|
+
*src_backend_id = tensor_backend_id(src->view_src);
|
1263
|
+
SET_CAUSE(src, "4.vsrc");
|
1264
|
+
} else {
|
1265
|
+
*src_backend_id = *cur_backend_id;
|
1266
|
+
SET_CAUSE(src, "4.cur");
|
1138
1267
|
}
|
1139
1268
|
}
|
1140
|
-
RegCloseKey(hKey);
|
1141
1269
|
}
|
1142
|
-
#endif
|
1143
1270
|
}
|
1144
|
-
};
|
1145
|
-
|
1146
|
-
static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
|
1147
|
-
return "CPU";
|
1148
|
-
|
1149
|
-
LM_GGML_UNUSED(dev);
|
1150
|
-
}
|
1151
1271
|
|
1152
|
-
|
1153
|
-
|
1272
|
+
// pass 5: split graph, find tensors that need to be copied
|
1273
|
+
{
|
1274
|
+
int i_split = 0;
|
1275
|
+
struct lm_ggml_backend_sched_split * split = &sched->splits[0];
|
1276
|
+
// find the backend of the first split, skipping view ops
|
1277
|
+
int i = 0;
|
1278
|
+
for (; i < graph->n_nodes; i++) {
|
1279
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1280
|
+
if (!lm_ggml_is_view_op(node->op)) {
|
1281
|
+
split->backend_id = tensor_backend_id(node);
|
1282
|
+
break;
|
1283
|
+
}
|
1284
|
+
}
|
1285
|
+
split->i_start = 0;
|
1286
|
+
split->n_inputs = 0;
|
1287
|
+
int cur_backend_id = split->backend_id;
|
1288
|
+
for (; i < graph->n_nodes; i++) {
|
1289
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1154
1290
|
|
1155
|
-
|
1156
|
-
|
1291
|
+
if (lm_ggml_is_view_op(node->op)) {
|
1292
|
+
continue;
|
1293
|
+
}
|
1157
1294
|
|
1158
|
-
|
1159
|
-
// TODO
|
1160
|
-
*free = 0;
|
1161
|
-
*total = 0;
|
1295
|
+
const int node_backend_id = tensor_backend_id(node);
|
1162
1296
|
|
1163
|
-
|
1164
|
-
}
|
1297
|
+
assert(node_backend_id != -1); // all nodes should be assigned by now
|
1165
1298
|
|
1166
|
-
|
1167
|
-
|
1299
|
+
// check if we should start a new split based on the sources of the current node
|
1300
|
+
bool need_new_split = false;
|
1301
|
+
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
1302
|
+
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1303
|
+
struct lm_ggml_tensor * src = node->src[j];
|
1304
|
+
if (src == NULL) {
|
1305
|
+
continue;
|
1306
|
+
}
|
1307
|
+
// check if a weight is on a different and incompatible backend
|
1308
|
+
// by starting a new split, the memory of the previously offloaded weights can be reused
|
1309
|
+
if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
1310
|
+
int src_backend_id = tensor_backend_id(src);
|
1311
|
+
if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
1312
|
+
need_new_split = true;
|
1313
|
+
break;
|
1314
|
+
}
|
1315
|
+
}
|
1316
|
+
// check if the split has too many inputs
|
1317
|
+
// FIXME: count the number of inputs instead of only checking when full
|
1318
|
+
if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
|
1319
|
+
const size_t id = hash_id(src);
|
1320
|
+
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
1321
|
+
bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
1322
|
+
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
1323
|
+
need_new_split = true;
|
1324
|
+
break;
|
1325
|
+
}
|
1326
|
+
}
|
1327
|
+
}
|
1328
|
+
}
|
1168
1329
|
|
1169
|
-
|
1170
|
-
|
1330
|
+
if (node_backend_id != cur_backend_id || need_new_split) {
|
1331
|
+
split->i_end = i;
|
1332
|
+
i_split++;
|
1333
|
+
if (i_split >= sched->splits_capacity) {
|
1334
|
+
sched->splits_capacity *= 2;
|
1335
|
+
sched->splits = (lm_ggml_backend_sched_split *)
|
1336
|
+
realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
|
1337
|
+
LM_GGML_ASSERT(sched->splits != NULL);
|
1338
|
+
}
|
1339
|
+
split = &sched->splits[i_split];
|
1340
|
+
split->backend_id = node_backend_id;
|
1341
|
+
split->i_start = i;
|
1342
|
+
split->n_inputs = 0;
|
1343
|
+
cur_backend_id = node_backend_id;
|
1344
|
+
}
|
1171
1345
|
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
/* .async = */ false,
|
1179
|
-
/* .host_buffer = */ false,
|
1180
|
-
/* .buffer_from_host_ptr = */ true,
|
1181
|
-
/* .events = */ false,
|
1182
|
-
};
|
1183
|
-
}
|
1346
|
+
// find inputs that are not on the same backend
|
1347
|
+
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1348
|
+
struct lm_ggml_tensor * src = node->src[j];
|
1349
|
+
if (src == NULL) {
|
1350
|
+
continue;
|
1351
|
+
}
|
1184
1352
|
|
1185
|
-
|
1186
|
-
|
1353
|
+
size_t src_id = hash_id(src);
|
1354
|
+
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
1355
|
+
assert(src_backend_id != -1); // all inputs should be assigned by now
|
1187
1356
|
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1357
|
+
if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
1358
|
+
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
1359
|
+
lm_ggml_backend_t backend = sched->backends[src_backend_id];
|
1360
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1361
|
+
struct lm_ggml_tensor * tensor_copy;
|
1362
|
+
if (c == sched->cur_copy) {
|
1363
|
+
tensor_copy = src; // use the original tensor as the current copy
|
1364
|
+
} else {
|
1365
|
+
tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
|
1366
|
+
lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
|
1367
|
+
}
|
1368
|
+
if (sched->n_copies > 1) {
|
1369
|
+
lm_ggml_set_input(tensor_copy);
|
1370
|
+
lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
1371
|
+
}
|
1372
|
+
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
1373
|
+
SET_CAUSE(tensor_copy, "4.cpy");
|
1374
|
+
}
|
1375
|
+
int n_graph_inputs = sched->n_graph_inputs++;
|
1376
|
+
LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
|
1377
|
+
sched->graph_inputs[n_graph_inputs] = src;
|
1378
|
+
}
|
1379
|
+
}
|
1191
1380
|
|
1192
|
-
|
1193
|
-
|
1381
|
+
if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
1382
|
+
// create a copy of the input in the split's backend
|
1383
|
+
if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
|
1384
|
+
lm_ggml_backend_t backend = sched->backends[cur_backend_id];
|
1385
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1386
|
+
struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
|
1387
|
+
lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
|
1388
|
+
if (sched->n_copies > 1) {
|
1389
|
+
lm_ggml_set_input(tensor_copy);
|
1390
|
+
lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
1391
|
+
}
|
1392
|
+
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
|
1393
|
+
SET_CAUSE(tensor_copy, "4.cpy");
|
1394
|
+
}
|
1395
|
+
int n_inputs = split->n_inputs++;
|
1396
|
+
LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
|
1397
|
+
split->inputs[n_inputs] = src;
|
1398
|
+
}
|
1399
|
+
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
|
1400
|
+
}
|
1401
|
+
}
|
1402
|
+
}
|
1403
|
+
split->i_end = graph->n_nodes;
|
1404
|
+
sched->n_splits = i_split + 1;
|
1405
|
+
}
|
1194
1406
|
|
1195
|
-
|
1196
|
-
|
1407
|
+
if (sched->debug) {
|
1408
|
+
lm_ggml_backend_sched_print_assignments(sched, graph);
|
1409
|
+
}
|
1197
1410
|
|
1198
|
-
|
1199
|
-
|
1411
|
+
// swap node_backend_ids and leaf _backend_ids with prevs
|
1412
|
+
{
|
1413
|
+
int * tmp = sched->node_backend_ids;
|
1414
|
+
sched->node_backend_ids = sched->prev_node_backend_ids;
|
1415
|
+
sched->prev_node_backend_ids = tmp;
|
1200
1416
|
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1417
|
+
tmp = sched->leaf_backend_ids;
|
1418
|
+
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
1419
|
+
sched->prev_leaf_backend_ids = tmp;
|
1420
|
+
}
|
1204
1421
|
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1211
|
-
|
1212
|
-
op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
1213
|
-
case LM_GGML_OP_MUL_MAT:
|
1214
|
-
return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
|
1215
|
-
case LM_GGML_OP_ROPE_BACK:
|
1216
|
-
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
1217
|
-
case LM_GGML_OP_IM2COL_BACK:
|
1218
|
-
return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
|
1219
|
-
case LM_GGML_OP_OUT_PROD:
|
1220
|
-
return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
|
1221
|
-
default:
|
1222
|
-
return true;
|
1422
|
+
int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
1423
|
+
if (sched->graph.size < graph_size) {
|
1424
|
+
sched->graph.size = graph_size;
|
1425
|
+
sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
|
1426
|
+
sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
|
1427
|
+
LM_GGML_ASSERT(sched->graph.nodes != NULL);
|
1428
|
+
LM_GGML_ASSERT(sched->graph.leafs != NULL);
|
1223
1429
|
}
|
1430
|
+
sched->graph.n_nodes = 0;
|
1431
|
+
sched->graph.n_leafs = 0;
|
1224
1432
|
|
1225
|
-
|
1226
|
-
}
|
1227
|
-
|
1228
|
-
static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
|
1229
|
-
return lm_ggml_backend_buft_is_host(buft);
|
1230
|
-
|
1231
|
-
LM_GGML_UNUSED(dev);
|
1232
|
-
}
|
1433
|
+
struct lm_ggml_cgraph * graph_copy = &sched->graph;
|
1233
1434
|
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
/* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
|
1238
|
-
/* .get_type = */ lm_ggml_backend_cpu_device_get_type,
|
1239
|
-
/* .get_props = */ lm_ggml_backend_cpu_device_get_props,
|
1240
|
-
/* .init_backend = */ lm_ggml_backend_cpu_device_init,
|
1241
|
-
/* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
|
1242
|
-
/* .get_host_buffer_type = */ NULL,
|
1243
|
-
/* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_ptr,
|
1244
|
-
/* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
|
1245
|
-
/* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
|
1246
|
-
/* .offload_op = */ NULL,
|
1247
|
-
/* .event_new = */ NULL,
|
1248
|
-
/* .event_free = */ NULL,
|
1249
|
-
/* .event_synchronize = */ NULL,
|
1250
|
-
};
|
1435
|
+
for (int i = 0; i < sched->n_splits; i++) {
|
1436
|
+
struct lm_ggml_backend_sched_split * split = &sched->splits[i];
|
1437
|
+
split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
|
1251
1438
|
|
1252
|
-
|
1439
|
+
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
1440
|
+
for (int j = 0; j < split->n_inputs; j++) {
|
1441
|
+
assert(graph_copy->size > (graph_copy->n_nodes + 1));
|
1253
1442
|
|
1254
|
-
|
1255
|
-
|
1443
|
+
struct lm_ggml_tensor * input = split->inputs[j];
|
1444
|
+
const size_t input_id = hash_id(input);
|
1445
|
+
struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
|
1256
1446
|
|
1257
|
-
|
1258
|
-
|
1447
|
+
// add a dependency to the input source so that it is not freed before the copy is done
|
1448
|
+
struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
|
1449
|
+
input_dep->src[0] = input;
|
1450
|
+
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
|
1451
|
+
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
1259
1452
|
|
1260
|
-
|
1261
|
-
|
1453
|
+
// add a dependency to the input copy so that it is allocated at the start of the split
|
1454
|
+
sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
|
1455
|
+
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
1456
|
+
}
|
1262
1457
|
|
1263
|
-
|
1264
|
-
|
1458
|
+
for (int j = split->i_start; j < split->i_end; j++) {
|
1459
|
+
assert(graph_copy->size > graph_copy->n_nodes);
|
1460
|
+
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
|
1461
|
+
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
1462
|
+
}
|
1463
|
+
}
|
1265
1464
|
|
1266
|
-
|
1267
|
-
|
1465
|
+
if (sched->n_copies > 1) {
|
1466
|
+
// add input copies as leafs so that they are allocated first
|
1467
|
+
for (int i = 0; i < sched->n_graph_inputs; i++) {
|
1468
|
+
struct lm_ggml_tensor * input = sched->graph_inputs[i];
|
1469
|
+
size_t id = hash_id(input);
|
1470
|
+
int backend_id = tensor_backend_id(input);
|
1471
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1472
|
+
struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
1473
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
1474
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1475
|
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
1476
|
+
}
|
1477
|
+
}
|
1268
1478
|
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1479
|
+
for (int i = 0; i < sched->n_splits; i++) {
|
1480
|
+
struct lm_ggml_backend_sched_split * split = &sched->splits[i];
|
1481
|
+
int backend_id = split->backend_id;
|
1482
|
+
for (int j = 0; j < split->n_inputs; j++) {
|
1483
|
+
struct lm_ggml_tensor * input = split->inputs[j];
|
1484
|
+
size_t id = hash_id(input);
|
1485
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1486
|
+
struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
1487
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
1488
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1489
|
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
1490
|
+
}
|
1491
|
+
}
|
1492
|
+
}
|
1493
|
+
}
|
1275
1494
|
|
1276
|
-
|
1495
|
+
// add leafs from the original graph
|
1496
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
1497
|
+
struct lm_ggml_tensor * leaf = graph->leafs[i];
|
1498
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
1499
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1500
|
+
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
1501
|
+
}
|
1277
1502
|
}
|
1278
1503
|
|
1279
|
-
static
|
1280
|
-
|
1281
|
-
|
1504
|
+
static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
|
1505
|
+
bool backend_ids_changed = false;
|
1506
|
+
for (int i = 0; i < sched->graph.n_nodes; i++) {
|
1507
|
+
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
1508
|
+
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
1509
|
+
backend_ids_changed = true;
|
1510
|
+
break;
|
1511
|
+
}
|
1512
|
+
}
|
1513
|
+
if (!backend_ids_changed) {
|
1514
|
+
for (int i = 0; i < sched->graph.n_leafs; i++) {
|
1515
|
+
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
1516
|
+
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
1517
|
+
backend_ids_changed = true;
|
1518
|
+
break;
|
1519
|
+
}
|
1520
|
+
}
|
1282
1521
|
}
|
1283
|
-
return NULL;
|
1284
1522
|
|
1285
|
-
|
1523
|
+
// allocate graph
|
1524
|
+
if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
1525
|
+
// the re-allocation may cause the split inputs to be moved to a different address
|
1526
|
+
lm_ggml_backend_sched_synchronize(sched);
|
1527
|
+
#ifndef NDEBUG
|
1528
|
+
LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
1529
|
+
#endif
|
1530
|
+
lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
1531
|
+
if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
1532
|
+
LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
1533
|
+
return false;
|
1534
|
+
}
|
1535
|
+
}
|
1536
|
+
|
1537
|
+
return true;
|
1286
1538
|
}
|
1287
1539
|
|
1288
|
-
static
|
1289
|
-
|
1290
|
-
/* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
|
1291
|
-
/* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
|
1292
|
-
/* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
|
1293
|
-
};
|
1540
|
+
static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
|
1541
|
+
struct lm_ggml_backend_sched_split * splits = sched->splits;
|
1294
1542
|
|
1295
|
-
|
1296
|
-
|
1297
|
-
|
1298
|
-
|
1299
|
-
};
|
1543
|
+
for (int i = 0; i < sched->n_splits; i++) {
|
1544
|
+
struct lm_ggml_backend_sched_split * split = &splits[i];
|
1545
|
+
int split_backend_id = split->backend_id;
|
1546
|
+
lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
|
1300
1547
|
|
1301
|
-
|
1302
|
-
|
1548
|
+
// copy the input tensors to the split backend
|
1549
|
+
for (int j = 0; j < split->n_inputs; j++) {
|
1550
|
+
lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
1551
|
+
struct lm_ggml_tensor * input = split->inputs[j];
|
1552
|
+
struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
1303
1553
|
|
1304
|
-
|
1554
|
+
if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
|
1555
|
+
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
1556
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1557
|
+
lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
1558
|
+
} else {
|
1559
|
+
lm_ggml_backend_synchronize(split_backend);
|
1560
|
+
}
|
1561
|
+
lm_ggml_backend_tensor_copy(input, input_cpy);
|
1562
|
+
} else {
|
1563
|
+
// wait for the split backend to finish using the input before overwriting it
|
1564
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1565
|
+
lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
1566
|
+
} else {
|
1567
|
+
lm_ggml_backend_synchronize(split_backend);
|
1568
|
+
}
|
1569
|
+
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
1570
|
+
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
1571
|
+
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
1572
|
+
lm_ggml_backend_synchronize(input_backend);
|
1573
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1574
|
+
lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
1575
|
+
} else {
|
1576
|
+
lm_ggml_backend_synchronize(split_backend);
|
1577
|
+
}
|
1578
|
+
lm_ggml_backend_tensor_copy(input, input_cpy);
|
1579
|
+
}
|
1580
|
+
}
|
1581
|
+
}
|
1305
1582
|
|
1306
|
-
|
1307
|
-
|
1308
|
-
|
1309
|
-
|
1583
|
+
if (!sched->callback_eval) {
|
1584
|
+
enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
|
1585
|
+
if (ec != LM_GGML_STATUS_SUCCESS) {
|
1586
|
+
return ec;
|
1587
|
+
}
|
1588
|
+
} else {
|
1589
|
+
// similar to lm_ggml_backend_compare_graph_backend
|
1590
|
+
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
1591
|
+
struct lm_ggml_tensor * t = split->graph.nodes[j0];
|
1310
1592
|
|
1311
|
-
|
1312
|
-
|
1593
|
+
// check if the user needs data from this node
|
1594
|
+
bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
1313
1595
|
|
1314
|
-
|
1315
|
-
}
|
1596
|
+
int j1 = j0;
|
1316
1597
|
|
1317
|
-
|
1318
|
-
|
1319
|
-
|
1320
|
-
|
1321
|
-
|
1598
|
+
// determine the range [j0, j1] of nodes that can be computed together
|
1599
|
+
while (!need && j1 < split->graph.n_nodes - 1) {
|
1600
|
+
t = split->graph.nodes[++j1];
|
1601
|
+
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
1602
|
+
}
|
1322
1603
|
|
1323
|
-
|
1324
|
-
free(ctx);
|
1325
|
-
}
|
1604
|
+
struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
|
1326
1605
|
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
}
|
1332
|
-
}
|
1606
|
+
enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
|
1607
|
+
if (ec != LM_GGML_STATUS_SUCCESS) {
|
1608
|
+
return ec;
|
1609
|
+
}
|
1333
1610
|
|
1334
|
-
|
1335
|
-
|
1336
|
-
/* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
|
1337
|
-
/* .get_base = */ NULL,
|
1338
|
-
/* .init_tensor = */ NULL,
|
1339
|
-
/* .memset_tensor = */ NULL,
|
1340
|
-
/* .set_tensor = */ NULL,
|
1341
|
-
/* .get_tensor = */ NULL,
|
1342
|
-
/* .cpy_tensor = */ NULL,
|
1343
|
-
/* .clear = */ lm_ggml_backend_multi_buffer_clear,
|
1344
|
-
/* .reset = */ NULL,
|
1345
|
-
};
|
1611
|
+
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
|
1612
|
+
lm_ggml_backend_synchronize(split_backend);
|
1346
1613
|
|
1347
|
-
|
1348
|
-
|
1349
|
-
|
1350
|
-
ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
|
1614
|
+
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
1615
|
+
break;
|
1616
|
+
}
|
1351
1617
|
|
1352
|
-
|
1618
|
+
j0 = j1;
|
1619
|
+
}
|
1620
|
+
}
|
1353
1621
|
|
1354
|
-
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1622
|
+
// record the event of this copy
|
1623
|
+
if (split->n_inputs > 0) {
|
1624
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1625
|
+
lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
|
1626
|
+
}
|
1627
|
+
}
|
1358
1628
|
}
|
1359
1629
|
|
1360
|
-
|
1361
|
-
}
|
1630
|
+
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
1362
1631
|
|
1363
|
-
|
1364
|
-
return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
|
1632
|
+
return LM_GGML_STATUS_SUCCESS;
|
1365
1633
|
}
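
Note on the compute path that ends here: when the scheduler is created with parallel = true it keeps several copies of each split input (n_copies) plus one event per backend/copy pair, so the inputs for the next iteration can be uploaded while the current one is still executing. The helper below is a condensed, hypothetical sketch of that copy/compute/record pattern, not the scheduler's actual code; it only uses the event and copy calls that appear in this diff.

    #include "ggml-backend.h"

    // Condensed, hypothetical sketch of the per-split copy/compute/record pattern
    // used by lm_ggml_backend_sched_compute_splits above (not the scheduler's code).
    static enum lm_ggml_status run_split_pipelined(
            lm_ggml_backend_t       split_backend,
            lm_ggml_backend_event_t copy_event,   // one event per (backend, copy) pair, may be NULL
            struct lm_ggml_tensor * input,        // tensor produced by another backend
            struct lm_ggml_tensor * input_cpy,    // pre-allocated copy on split_backend
            struct lm_ggml_cgraph * split_graph) {
        // wait until the previous use of input_cpy has finished before overwriting it
        if (copy_event != NULL) {
            lm_ggml_backend_event_wait(split_backend, copy_event);
        } else {
            lm_ggml_backend_synchronize(split_backend);
        }
        lm_ggml_backend_tensor_copy(input, input_cpy);

        // launch the split asynchronously, then mark this copy as in flight
        enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, split_graph);
        if (ec == LM_GGML_STATUS_SUCCESS && copy_event != NULL) {
            lm_ggml_backend_event_record(copy_event, split_backend);
        }
        return ec;
    }
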
|
1366
1634
|
|
1367
|
-
|
1368
|
-
|
1369
|
-
|
1370
|
-
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1635
|
+
lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
|
1636
|
+
lm_ggml_backend_t * backends,
|
1637
|
+
lm_ggml_backend_buffer_type_t * bufts,
|
1638
|
+
int n_backends,
|
1639
|
+
size_t graph_size,
|
1640
|
+
bool parallel) {
|
1641
|
+
LM_GGML_ASSERT(n_backends > 0);
|
1642
|
+
LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
|
1643
|
+
LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
1374
1644
|
|
1375
|
-
|
1376
|
-
static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
|
1377
|
-
struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
|
1378
|
-
for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
|
1379
|
-
dup->nb[i] = tensor->nb[i];
|
1380
|
-
}
|
1381
|
-
return dup;
|
1382
|
-
}
|
1645
|
+
struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
|
1383
1646
|
|
1384
|
-
|
1385
|
-
|
1386
|
-
|
1647
|
+
const char * LM_GGML_SCHED_DEBUG = getenv("LM_GGML_SCHED_DEBUG");
|
1648
|
+
sched->debug = LM_GGML_SCHED_DEBUG ? atoi(LM_GGML_SCHED_DEBUG) : 0;
|
1649
|
+
sched->n_backends = n_backends;
|
1650
|
+
sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
|
1387
1651
|
|
1388
|
-
//
|
1652
|
+
// initialize hash table
|
1653
|
+
// FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
|
1654
|
+
sched->hash_set = lm_ggml_hash_set_new(graph_size);
|
1655
|
+
sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
1656
|
+
sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
|
1389
1657
|
|
1390
|
-
|
1391
|
-
|
1392
|
-
|
1658
|
+
const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
|
1659
|
+
const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
1660
|
+
sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
1661
|
+
sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
1662
|
+
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
1663
|
+
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
1393
1664
|
|
1394
|
-
|
1395
|
-
|
1396
|
-
#endif
|
1665
|
+
sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
|
1666
|
+
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
1397
1667
|
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1668
|
+
const int initial_splits_capacity = 16;
|
1669
|
+
sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
1670
|
+
sched->splits_capacity = initial_splits_capacity;
|
1401
1671
|
|
1402
|
-
|
1403
|
-
|
1404
|
-
|
1405
|
-
|
1406
|
-
struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
|
1407
|
-
int n_inputs;
|
1408
|
-
// graph view of this split
|
1409
|
-
struct lm_ggml_cgraph graph;
|
1410
|
-
};
|
1672
|
+
for (int b = 0; b < n_backends; b++) {
|
1673
|
+
sched->backends[b] = backends[b];
|
1674
|
+
sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
|
1675
|
+
LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
1411
1676
|
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
1677
|
+
if (sched->n_copies > 1) {
|
1678
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1679
|
+
sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
|
1680
|
+
}
|
1681
|
+
}
|
1682
|
+
}
|
1415
1683
|
|
1416
|
-
|
1684
|
+
sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
|
1417
1685
|
|
1418
|
-
|
1419
|
-
lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
|
1420
|
-
lm_ggml_gallocr_t galloc;
|
1686
|
+
lm_ggml_backend_sched_reset(sched);
|
1421
1687
|
|
1422
|
-
|
1423
|
-
|
1424
|
-
int * hv_tensor_backend_ids; // [hash_set.size]
|
1425
|
-
struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
|
1688
|
+
return sched;
|
1689
|
+
}
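
For readers following the new scheduler API, here is a minimal end-to-end sketch of the constructor above together with reserve/compute/free. It assumes lm_ggml_backend_cpu_init() and lm_ggml_backend_free() from the public headers, and graph is a forward graph built by your own code; none of that is part of this diff.

    #include "ggml.h"
    #include "ggml-backend.h"

    static void run_graph_once(struct lm_ggml_cgraph * graph) {
        lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init();   // assumed CPU backend constructor
        lm_ggml_backend_t backends[1] = { cpu };              // the last backend must be CPU (asserted above)

        // graph_size sizes the internal hash table; nodes + leafs of the largest graph is enough
        lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(
            backends, /* bufts */ NULL, /* n_backends */ 1,
            /* graph_size */ (size_t) (graph->n_nodes + graph->n_leafs), /* parallel */ false);

        lm_ggml_backend_sched_reserve(sched, graph);          // pre-allocate compute buffers once
        enum lm_ggml_status st = lm_ggml_backend_sched_graph_compute(sched, graph);
        LM_GGML_ASSERT(st == LM_GGML_STATUS_SUCCESS);

        lm_ggml_backend_sched_free(sched);
        lm_ggml_backend_free(cpu);
    }
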
|
1426
1690
|
|
1427
|
-
|
1428
|
-
|
1691
|
+
void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
|
1692
|
+
if (sched == NULL) {
|
1693
|
+
return;
|
1694
|
+
}
|
1695
|
+
for (int b = 0; b < sched->n_backends; b++) {
|
1696
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1697
|
+
lm_ggml_backend_event_free(sched->events[b][c]);
|
1698
|
+
}
|
1699
|
+
}
|
1700
|
+
lm_ggml_gallocr_free(sched->galloc);
|
1701
|
+
lm_ggml_free(sched->ctx);
|
1702
|
+
lm_ggml_hash_set_free(&sched->hash_set);
|
1703
|
+
free(sched->splits);
|
1704
|
+
free(sched->hv_tensor_backend_ids);
|
1705
|
+
free(sched->hv_tensor_copies);
|
1706
|
+
free(sched->node_backend_ids);
|
1707
|
+
free(sched->leaf_backend_ids);
|
1708
|
+
free(sched->prev_node_backend_ids);
|
1709
|
+
free(sched->prev_leaf_backend_ids);
|
1710
|
+
free(sched->context_buffer);
|
1711
|
+
free(sched->graph.nodes);
|
1712
|
+
free(sched->graph.leafs);
|
1713
|
+
free(sched);
|
1714
|
+
}
|
1429
1715
|
|
1430
|
-
|
1431
|
-
|
1716
|
+
void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
|
1717
|
+
// reset state for the next run
|
1718
|
+
if (!sched->is_reset) {
|
1719
|
+
lm_ggml_hash_set_reset(&sched->hash_set);
|
1720
|
+
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
1721
|
+
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
|
1722
|
+
sched->is_reset = true;
|
1723
|
+
}
|
1724
|
+
sched->is_alloc = false;
|
1725
|
+
}
|
1432
1726
|
|
1433
|
-
|
1434
|
-
|
1727
|
+
bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
|
1728
|
+
LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
1435
1729
|
|
1436
|
-
|
1437
|
-
struct lm_ggml_backend_sched_split * splits;
|
1438
|
-
int n_splits;
|
1439
|
-
int splits_capacity;
|
1730
|
+
lm_ggml_backend_sched_split_graph(sched, measure_graph);
|
1440
1731
|
|
1441
|
-
|
1442
|
-
|
1443
|
-
|
1444
|
-
lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
|
1445
|
-
struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
|
1446
|
-
int n_graph_inputs;
|
1732
|
+
if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
1733
|
+
return false;
|
1734
|
+
}
|
1447
1735
|
|
1448
|
-
|
1736
|
+
lm_ggml_backend_sched_reset(sched);
|
1737
|
+
lm_ggml_backend_sched_synchronize(sched);
|
1449
1738
|
|
1450
|
-
|
1451
|
-
|
1739
|
+
return true;
|
1740
|
+
}
|
1452
1741
|
|
1453
|
-
|
1454
|
-
|
1742
|
+
bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
1743
|
+
LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
1455
1744
|
|
1456
|
-
|
1457
|
-
};
|
1745
|
+
lm_ggml_backend_sched_split_graph(sched, graph);
|
1458
1746
|
|
1459
|
-
#define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
1460
|
-
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
|
1461
|
-
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
|
1462
|
-
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
|
1463
1747
|
|
1464
|
-
|
1465
|
-
|
1466
|
-
for (int i = 0; i < sched->n_backends; i++) {
|
1467
|
-
if (sched->backends[i] == backend) {
|
1468
|
-
return i;
|
1469
|
-
}
|
1748
|
+
if (!lm_ggml_backend_sched_alloc_splits(sched)) {
|
1749
|
+
return false;
|
1470
1750
|
}
|
1471
|
-
|
1751
|
+
|
1752
|
+
sched->is_alloc = true;
|
1753
|
+
|
1754
|
+
return true;
|
1472
1755
|
}
|
1473
1756
|
|
1474
|
-
|
1475
|
-
|
1476
|
-
|
1477
|
-
|
1757
|
+
enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
1758
|
+
enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
|
1759
|
+
lm_ggml_backend_sched_synchronize(sched);
|
1760
|
+
return err;
|
1761
|
+
}
|
1762
|
+
|
1763
|
+
enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
1764
|
+
if (!sched->is_reset && !sched->is_alloc) {
|
1765
|
+
lm_ggml_backend_sched_reset(sched);
|
1478
1766
|
}
|
1479
1767
|
|
1480
|
-
|
1481
|
-
|
1482
|
-
|
1483
|
-
lm_ggml_backend_supports_op(sched->backends[i], op)) {
|
1484
|
-
return i;
|
1768
|
+
if (!sched->is_alloc) {
|
1769
|
+
if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
|
1770
|
+
return LM_GGML_STATUS_ALLOC_FAILED;
|
1485
1771
|
}
|
1486
1772
|
}
|
1487
1773
|
|
1488
|
-
|
1489
|
-
|
1490
|
-
__func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
|
1491
|
-
#endif
|
1774
|
+
return lm_ggml_backend_sched_compute_splits(sched);
|
1775
|
+
}
|
1492
1776
|
|
1493
|
-
|
1777
|
+
void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
|
1778
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
1779
|
+
lm_ggml_backend_synchronize(sched->backends[i]);
|
1780
|
+
}
|
1494
1781
|
}
|
1495
1782
|
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
#define GET_CAUSE(node) causes[hash_id(node)]
|
1501
|
-
#else
|
1502
|
-
#define SET_CAUSE(node, ...)
|
1503
|
-
#define GET_CAUSE(node) ""
|
1504
|
-
#endif
|
1783
|
+
void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
|
1784
|
+
sched->callback_eval = callback;
|
1785
|
+
sched->callback_eval_user_data = user_data;
|
1786
|
+
}
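
The eval callback registered here is invoked twice per node range by lm_ggml_backend_sched_compute_splits above: first with ask = true to find out whether the caller wants the node's result, then with ask = false once the nodes up to it have been computed; returning false from the second call stops the rest of the graph. A hypothetical observer is sketched below; the (tensor, ask, user_data) shape is inferred from those call sites, the real typedef is lm_ggml_backend_sched_eval_callback in ggml-backend.h.

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Hypothetical observer: log the name of every MUL_MAT result as it is computed.
    static bool debug_eval_cb(struct lm_ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            // phase 1: tell the scheduler whether this node's data is needed
            return t->op == LM_GGML_OP_MUL_MAT;
        }
        // phase 2: the node has been computed and can be inspected here
        fprintf(stderr, "computed %s\n", t->name);
        return true;   // returning false aborts the rest of the graph
    }

    // installed with:
    //   lm_ggml_backend_sched_set_eval_callback(sched, debug_eval_cb, /* user_data */ NULL);
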
|
1505
1787
|
|
1506
|
-
|
1507
|
-
|
1508
|
-
|
1788
|
+
int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
|
1789
|
+
return sched->n_splits;
|
1790
|
+
}
|
1509
1791
|
|
1510
|
-
|
1511
|
-
|
1512
|
-
|
1513
|
-
SET_CAUSE(tensor, "1.dst");
|
1514
|
-
return cur_backend_id;
|
1515
|
-
}
|
1792
|
+
int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
|
1793
|
+
return sched->n_copies;
|
1794
|
+
}
|
1516
1795
|
|
1517
|
-
|
1518
|
-
|
1519
|
-
|
1520
|
-
if (cur_backend_id != -1) {
|
1521
|
-
SET_CAUSE(tensor, "1.vsrc");
|
1522
|
-
return cur_backend_id;
|
1523
|
-
}
|
1524
|
-
}
|
1525
|
-
|
1526
|
-
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
1527
|
-
// since the tensor is pre-allocated, it cannot be moved to another backend
|
1528
|
-
LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
1529
|
-
}
|
1796
|
+
int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
|
1797
|
+
return sched->n_backends;
|
1798
|
+
}
|
1530
1799
|
|
1531
|
-
|
1532
|
-
|
1533
|
-
|
1534
|
-
|
1535
|
-
return cur_backend_id;
|
1536
|
-
}
|
1800
|
+
lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
|
1801
|
+
LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
1802
|
+
return sched->backends[i];
|
1803
|
+
}
|
1537
1804
|
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
if (src == NULL) {
|
1542
|
-
continue;
|
1543
|
-
}
|
1544
|
-
if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
1545
|
-
int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
1546
|
-
// check if a backend with higher prio wants to offload the op
|
1547
|
-
if (src_backend_id == sched->n_backends - 1) {
|
1548
|
-
for (int b = 0; b < src_backend_id; b++) {
|
1549
|
-
if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
|
1550
|
-
SET_CAUSE(tensor, "1.off");
|
1551
|
-
return b;
|
1552
|
-
}
|
1553
|
-
}
|
1554
|
-
}
|
1555
|
-
SET_CAUSE(tensor, "1.wgt%d", i);
|
1556
|
-
return src_backend_id;
|
1557
|
-
}
|
1558
|
-
}
|
1805
|
+
size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
|
1806
|
+
int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
|
1807
|
+
LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
1559
1808
|
|
1560
|
-
return
|
1809
|
+
return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
1561
1810
|
}
|
1562
1811
|
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1566
|
-
|
1567
|
-
|
1568
|
-
|
1569
|
-
}
|
1570
|
-
return buffer;
|
1812
|
+
void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
|
1813
|
+
int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
|
1814
|
+
LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
1815
|
+
tensor_backend_id(node) = backend_index;
|
1816
|
+
SET_CAUSE(node, "usr");
|
1817
|
+
sched->is_reset = false;
|
1571
1818
|
}
|
1572
1819
|
|
1573
|
-
|
1574
|
-
int
|
1575
|
-
|
1576
|
-
|
1577
|
-
lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
1578
|
-
LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
|
1579
|
-
sched->splits[cur_split].n_inputs);
|
1580
|
-
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
1581
|
-
LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
1582
|
-
fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
1583
|
-
}
|
1584
|
-
LM_GGML_LOG_DEBUG("\n");
|
1585
|
-
cur_split++;
|
1586
|
-
}
|
1587
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1588
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1589
|
-
continue;
|
1590
|
-
}
|
1591
|
-
lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
|
1592
|
-
LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
|
1593
|
-
fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
1594
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1595
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1596
|
-
if (src == NULL) {
|
1597
|
-
continue;
|
1598
|
-
}
|
1599
|
-
lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
|
1600
|
-
LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
1601
|
-
fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
1602
|
-
}
|
1603
|
-
LM_GGML_LOG_DEBUG("\n");
|
1820
|
+
lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
|
1821
|
+
int backend_index = tensor_backend_id(node);
|
1822
|
+
if (backend_index == -1) {
|
1823
|
+
return NULL;
|
1604
1824
|
}
|
1825
|
+
return sched->backends[backend_index];
|
1605
1826
|
}
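
A small sketch of combining the two assignment helpers above: pin a node to a chosen backend before allocation and verify the assignment afterwards. The names are hypothetical; `out` is any node of your graph and `cpu` a backend that was passed to lm_ggml_backend_sched_new.

    // Hypothetical helper: pin one node, then split/allocate with the pin applied.
    static void pin_node_to_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph,
                                    struct lm_ggml_tensor * out, lm_ggml_backend_t cpu) {
        lm_ggml_backend_sched_reset(sched);                         // start from a clean assignment
        lm_ggml_backend_sched_set_tensor_backend(sched, out, cpu);  // user pin, kept by the splitter
        lm_ggml_backend_sched_alloc_graph(sched, graph);

        LM_GGML_ASSERT(lm_ggml_backend_sched_get_tensor_backend(sched, out) == cpu);
    }
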
|
1606
1827
|
|
1607
|
-
|
1608
|
-
lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
1609
|
-
lm_ggml_backend_buffer_type_t buft = NULL;
|
1828
|
+
// utils
|
1610
1829
|
|
1611
|
-
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
1615
|
-
|
1616
|
-
int tensor_backend_id = tensor_backend_id(t);
|
1617
|
-
if (tensor_backend_id == -1 && t->view_src) {
|
1618
|
-
tensor_backend_id = tensor_backend_id(t->view_src);
|
1619
|
-
}
|
1620
|
-
if (tensor_backend_id != -1) {
|
1621
|
-
buft = sched->bufts[tensor_backend_id];
|
1622
|
-
}
|
1623
|
-
}
|
1830
|
+
void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
|
1831
|
+
LM_GGML_ASSERT(tensor->buffer == NULL);
|
1832
|
+
LM_GGML_ASSERT(tensor->view_src != NULL);
|
1833
|
+
LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
|
1834
|
+
LM_GGML_ASSERT(tensor->view_src->data != NULL);
|
1624
1835
|
|
1625
|
-
|
1836
|
+
tensor->buffer = tensor->view_src->buffer;
|
1837
|
+
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
1838
|
+
lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
1626
1839
|
}
|
1627
1840
|
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1631
|
-
|
1632
|
-
|
1841
|
+
void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
|
1842
|
+
LM_GGML_ASSERT(tensor->buffer == NULL);
|
1843
|
+
LM_GGML_ASSERT(tensor->data == NULL);
|
1844
|
+
LM_GGML_ASSERT(tensor->view_src == NULL);
|
1845
|
+
LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
|
1846
|
+
LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
|
1847
|
+
(char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
|
1848
|
+
|
1849
|
+
tensor->buffer = buffer;
|
1850
|
+
tensor->data = addr;
|
1851
|
+
lm_ggml_backend_buffer_init_tensor(buffer, tensor);
|
1633
1852
|
}
|
1634
1853
|
|
1635
|
-
|
1636
|
-
|
1637
|
-
// reset splits
|
1638
|
-
sched->n_splits = 0;
|
1639
|
-
sched->n_graph_inputs = 0;
|
1640
|
-
sched->is_reset = false;
|
1854
|
+
static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
|
1855
|
+
struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
|
1641
1856
|
|
1642
|
-
|
1643
|
-
|
1644
|
-
/* .mem_buffer = */ sched->context_buffer,
|
1645
|
-
/* .no_alloc = */ true
|
1646
|
-
};
|
1857
|
+
LM_GGML_ASSERT(src != NULL);
|
1858
|
+
LM_GGML_ASSERT(src->data && "graph must be allocated");
|
1647
1859
|
|
1648
|
-
|
1860
|
+
size_t id = lm_ggml_hash_insert(&hash_set, src);
|
1861
|
+
if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
|
1862
|
+
return node_copies[lm_ggml_hash_find(&hash_set, src)];
|
1863
|
+
}
|
1649
1864
|
|
1650
|
-
|
1651
|
-
if (
|
1652
|
-
|
1865
|
+
struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
1866
|
+
if (src->view_src != NULL) {
|
1867
|
+
dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
|
1868
|
+
dst->view_offs = src->view_offs;
|
1653
1869
|
}
|
1870
|
+
dst->op = src->op;
|
1871
|
+
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
|
1872
|
+
lm_ggml_set_name(dst, src->name);
|
1654
1873
|
|
1655
|
-
//
|
1656
|
-
for (int i = 0; i <
|
1657
|
-
struct lm_ggml_tensor *
|
1658
|
-
|
1659
|
-
|
1660
|
-
if (*leaf_backend_id == -1) {
|
1661
|
-
*leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
1874
|
+
// copy src
|
1875
|
+
for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
|
1876
|
+
struct lm_ggml_tensor * s = src->src[i];
|
1877
|
+
if (s == NULL) {
|
1878
|
+
continue;
|
1662
1879
|
}
|
1880
|
+
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
1663
1881
|
}
|
1664
1882
|
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
// do not overwrite user assignments
|
1669
|
-
if (*node_backend_id == -1) {
|
1670
|
-
*node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
|
1883
|
+
node_copies[id] = dst;
|
1884
|
+
return dst;
|
1885
|
+
}
|
1671
1886
|
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1887
|
+
static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
|
1888
|
+
size_t id = lm_ggml_hash_find(hash_set, src);
|
1889
|
+
if (node_init[id]) {
|
1890
|
+
return;
|
1891
|
+
}
|
1892
|
+
node_init[id] = true;
|
1677
1893
|
|
1678
|
-
|
1679
|
-
|
1680
|
-
|
1681
|
-
|
1682
|
-
|
1683
|
-
|
1684
|
-
|
1685
|
-
*src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
|
1686
|
-
}
|
1687
|
-
}
|
1688
|
-
#endif
|
1689
|
-
}
|
1894
|
+
struct lm_ggml_tensor * dst = node_copies[id];
|
1895
|
+
if (dst->view_src != NULL) {
|
1896
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
1897
|
+
lm_ggml_backend_view_init(dst);
|
1898
|
+
}
|
1899
|
+
else {
|
1900
|
+
lm_ggml_backend_tensor_copy(src, dst);
|
1690
1901
|
}
|
1691
1902
|
|
1692
|
-
//
|
1693
|
-
|
1694
|
-
|
1695
|
-
|
1696
|
-
|
1697
|
-
// expand gpu down
|
1698
|
-
{
|
1699
|
-
int cur_backend_id = -1;
|
1700
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1701
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1702
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1703
|
-
continue;
|
1704
|
-
}
|
1705
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1706
|
-
if (*node_backend_id != -1) {
|
1707
|
-
if (*node_backend_id == sched->n_backends - 1) {
|
1708
|
-
// skip cpu (lowest prio backend)
|
1709
|
-
cur_backend_id = -1;
|
1710
|
-
} else {
|
1711
|
-
cur_backend_id = *node_backend_id;
|
1712
|
-
}
|
1713
|
-
} else if (cur_backend_id != -1) {
|
1714
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1715
|
-
}
|
1903
|
+
// init src
|
1904
|
+
for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
|
1905
|
+
struct lm_ggml_tensor * s = src->src[i];
|
1906
|
+
if (s == NULL) {
|
1907
|
+
continue;
|
1716
1908
|
}
|
1909
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
1717
1910
|
}
|
1718
|
-
|
1719
|
-
{
|
1720
|
-
int cur_backend_id = -1;
|
1721
|
-
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1722
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1723
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1724
|
-
continue;
|
1725
|
-
}
|
1726
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1727
|
-
if (*node_backend_id != -1) {
|
1728
|
-
if (*node_backend_id == sched->n_backends - 1) {
|
1729
|
-
// skip cpu (lowest prio backend)
|
1730
|
-
cur_backend_id = -1;
|
1731
|
-
} else {
|
1732
|
-
cur_backend_id = *node_backend_id;
|
1733
|
-
}
|
1734
|
-
} else if (cur_backend_id != -1) {
|
1735
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1736
|
-
}
|
1737
|
-
}
|
1738
|
-
}
|
1739
|
-
// expand rest down
|
1740
|
-
{
|
1741
|
-
int cur_backend_id = -1;
|
1742
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1743
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1744
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1745
|
-
continue;
|
1746
|
-
}
|
1747
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1748
|
-
if (*node_backend_id != -1) {
|
1749
|
-
cur_backend_id = *node_backend_id;
|
1750
|
-
} else if (cur_backend_id != -1) {
|
1751
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1752
|
-
}
|
1753
|
-
}
|
1754
|
-
}
|
1755
|
-
// expand rest up
|
1756
|
-
{
|
1757
|
-
int cur_backend_id = -1;
|
1758
|
-
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1759
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1760
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1761
|
-
continue;
|
1762
|
-
}
|
1763
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1764
|
-
if (*node_backend_id != -1) {
|
1765
|
-
cur_backend_id = *node_backend_id;
|
1766
|
-
} else if (cur_backend_id != -1) {
|
1767
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1768
|
-
}
|
1769
|
-
}
|
1770
|
-
}
|
1771
|
-
|
1772
|
-
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
1773
|
-
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
1774
|
-
// however, we also need to verify that the sources are in compatible buffer types
|
1775
|
-
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
1776
|
-
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
1777
|
-
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
1778
|
-
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
1779
|
-
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
1780
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1781
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1782
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1783
|
-
continue;
|
1784
|
-
}
|
1785
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1786
|
-
if (*node_backend_id == -1) {
|
1787
|
-
// unassigned node: find the backend with the most supported inputs
|
1788
|
-
int n_supported_best = -1;
|
1789
|
-
for (int b = 0; b < sched->n_backends; b++) {
|
1790
|
-
if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
|
1791
|
-
int n_supported = 0;
|
1792
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1793
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1794
|
-
if (src == NULL) {
|
1795
|
-
continue;
|
1796
|
-
}
|
1797
|
-
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
|
1798
|
-
n_supported++;
|
1799
|
-
}
|
1800
|
-
}
|
1801
|
-
if (n_supported > n_supported_best) {
|
1802
|
-
n_supported_best = n_supported;
|
1803
|
-
*node_backend_id = b;
|
1804
|
-
SET_CAUSE(node, "3.best");
|
1805
|
-
}
|
1806
|
-
}
|
1807
|
-
}
|
1808
|
-
} else {
|
1809
|
-
// assigned node: upgrade to higher prio backend if possible
|
1810
|
-
for (int b = 0; b < *node_backend_id; b++) {
|
1811
|
-
if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
|
1812
|
-
bool supported = true;
|
1813
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1814
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1815
|
-
if (src == NULL) {
|
1816
|
-
continue;
|
1817
|
-
}
|
1818
|
-
if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
|
1819
|
-
supported = false;
|
1820
|
-
break;
|
1821
|
-
}
|
1822
|
-
}
|
1823
|
-
if (supported) {
|
1824
|
-
*node_backend_id = b;
|
1825
|
-
SET_CAUSE(node, "3.upg");
|
1826
|
-
break;
|
1827
|
-
}
|
1828
|
-
}
|
1829
|
-
}
|
1830
|
-
}
|
1831
|
-
}
|
1832
|
-
|
1833
|
-
// pass 4: assign backends to remaining src from dst and view_src
|
1834
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1835
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1836
|
-
int * cur_backend_id = &tensor_backend_id(node);
|
1837
|
-
if (node->view_src != NULL && *cur_backend_id == -1) {
|
1838
|
-
*cur_backend_id = tensor_backend_id(node->view_src);
|
1839
|
-
SET_CAUSE(node, "4.vsrc");
|
1840
|
-
}
|
1841
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1842
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1843
|
-
if (src == NULL) {
|
1844
|
-
continue;
|
1845
|
-
}
|
1846
|
-
int * src_backend_id = &tensor_backend_id(src);
|
1847
|
-
if (*src_backend_id == -1) {
|
1848
|
-
if (src->view_src != NULL) {
|
1849
|
-
// views are always on the same backend as the source
|
1850
|
-
*src_backend_id = tensor_backend_id(src->view_src);
|
1851
|
-
SET_CAUSE(src, "4.vsrc");
|
1852
|
-
} else {
|
1853
|
-
*src_backend_id = *cur_backend_id;
|
1854
|
-
SET_CAUSE(src, "4.cur");
|
1855
|
-
}
|
1856
|
-
}
|
1857
|
-
}
|
1858
|
-
}
|
1859
|
-
|
1860
|
-
// pass 5: split graph, find tensors that need to be copied
|
1861
|
-
{
|
1862
|
-
int i_split = 0;
|
1863
|
-
struct lm_ggml_backend_sched_split * split = &sched->splits[0];
|
1864
|
-
// find the backend of the first split, skipping view ops
|
1865
|
-
int i = 0;
|
1866
|
-
for (; i < graph->n_nodes; i++) {
|
1867
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1868
|
-
if (!lm_ggml_is_view_op(node->op)) {
|
1869
|
-
split->backend_id = tensor_backend_id(node);
|
1870
|
-
break;
|
1871
|
-
}
|
1872
|
-
}
|
1873
|
-
split->i_start = 0;
|
1874
|
-
split->n_inputs = 0;
|
1875
|
-
int cur_backend_id = split->backend_id;
|
1876
|
-
for (; i < graph->n_nodes; i++) {
|
1877
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1878
|
-
|
1879
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1880
|
-
continue;
|
1881
|
-
}
|
1882
|
-
|
1883
|
-
const int node_backend_id = tensor_backend_id(node);
|
1884
|
-
|
1885
|
-
assert(node_backend_id != -1); // all nodes should be assigned by now
|
1886
|
-
|
1887
|
-
// check if we should start a new split based on the sources of the current node
|
1888
|
-
bool need_new_split = false;
|
1889
|
-
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
1890
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1891
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1892
|
-
if (src == NULL) {
|
1893
|
-
continue;
|
1894
|
-
}
|
1895
|
-
// check if a weight is on a different backend
|
1896
|
-
// by starting a new split, the memory of the previously offloaded weights can be reused
|
1897
|
-
if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
1898
|
-
int src_backend_id = tensor_backend_id(src);
|
1899
|
-
if (src_backend_id != cur_backend_id) {
|
1900
|
-
need_new_split = true;
|
1901
|
-
break;
|
1902
|
-
}
|
1903
|
-
}
|
1904
|
-
// check if the split has too many inputs
|
1905
|
-
// FIXME: count the number of inputs instead of only checking when full
|
1906
|
-
if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
|
1907
|
-
const size_t id = hash_id(src);
|
1908
|
-
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
1909
|
-
bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
1910
|
-
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
1911
|
-
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
1912
|
-
need_new_split = true;
|
1913
|
-
break;
|
1914
|
-
}
|
1915
|
-
}
|
1916
|
-
}
|
1917
|
-
}
|
1911
|
+
}
|
1918
1912
|
|
1919
|
-
|
1920
|
-
|
1921
|
-
|
1922
|
-
|
1923
|
-
sched->splits_capacity *= 2;
|
1924
|
-
sched->splits = (lm_ggml_backend_sched_split *)
|
1925
|
-
realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
|
1926
|
-
LM_GGML_ASSERT(sched->splits != NULL);
|
1927
|
-
}
|
1928
|
-
split = &sched->splits[i_split];
|
1929
|
-
split->backend_id = node_backend_id;
|
1930
|
-
split->i_start = i;
|
1931
|
-
split->n_inputs = 0;
|
1932
|
-
cur_backend_id = node_backend_id;
|
1933
|
-
}
|
1913
|
+
struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
|
1914
|
+
struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
|
1915
|
+
struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
1916
|
+
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
|
1934
1917
|
|
1935
|
-
|
1936
|
-
|
1937
|
-
|
1938
|
-
|
1939
|
-
|
1940
|
-
}
|
1918
|
+
struct lm_ggml_init_params params = {
|
1919
|
+
/* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
|
1920
|
+
/* .mem_buffer = */ NULL,
|
1921
|
+
/* .no_alloc = */ true
|
1922
|
+
};
|
1941
1923
|
|
1942
|
-
|
1943
|
-
|
1944
|
-
assert(src_backend_id != -1); // all inputs should be assigned by now
|
1924
|
+
struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
|
1925
|
+
struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
|
1945
1926
|
|
1946
|
-
|
1947
|
-
|
1948
|
-
|
1949
|
-
|
1950
|
-
|
1951
|
-
|
1952
|
-
|
1953
|
-
|
1954
|
-
|
1955
|
-
|
1956
|
-
|
1957
|
-
|
1958
|
-
|
1959
|
-
|
1960
|
-
}
|
1961
|
-
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
1962
|
-
SET_CAUSE(tensor_copy, "4.cpy");
|
1963
|
-
}
|
1964
|
-
int n_graph_inputs = sched->n_graph_inputs++;
|
1965
|
-
LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
|
1966
|
-
sched->graph_inputs[n_graph_inputs] = src;
|
1967
|
-
}
|
1968
|
-
}
|
1927
|
+
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
1928
|
+
LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
|
1929
|
+
lm_ggml_hash_set_free(&hash_set);
|
1930
|
+
free(node_copies);
|
1931
|
+
free(node_init);
|
1932
|
+
lm_ggml_free(ctx_allocated);
|
1933
|
+
lm_ggml_free(ctx_unallocated);
|
1934
|
+
return {
|
1935
|
+
/* .buffer = */ NULL,
|
1936
|
+
/* .ctx_allocated = */ NULL,
|
1937
|
+
/* .ctx_unallocated = */ NULL,
|
1938
|
+
/* .graph = */ NULL,
|
1939
|
+
};
|
1940
|
+
}
|
1969
1941
|
|
1970
|
-
|
1971
|
-
|
1972
|
-
|
1973
|
-
|
1974
|
-
for (int c = 0; c < sched->n_copies; c++) {
|
1975
|
-
struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
|
1976
|
-
lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
|
1977
|
-
if (sched->n_copies > 1) {
|
1978
|
-
lm_ggml_set_input(tensor_copy);
|
1979
|
-
lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
1980
|
-
}
|
1981
|
-
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
|
1982
|
-
SET_CAUSE(tensor_copy, "4.cpy");
|
1983
|
-
}
|
1984
|
-
int n_inputs = split->n_inputs++;
|
1985
|
-
LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
|
1986
|
-
split->inputs[n_inputs] = src;
|
1987
|
-
}
|
1988
|
-
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
|
1989
|
-
}
|
1990
|
-
}
|
1991
|
-
}
|
1992
|
-
split->i_end = graph->n_nodes;
|
1993
|
-
sched->n_splits = i_split + 1;
|
1942
|
+
// dup nodes
|
1943
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1944
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1945
|
+
graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
1994
1946
|
}
|
1995
1947
|
|
1996
|
-
|
1997
|
-
|
1948
|
+
// allocate nodes
|
1949
|
+
lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
1950
|
+
if (buffer == NULL) {
|
1951
|
+
LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
|
1952
|
+
lm_ggml_hash_set_free(&hash_set);
|
1953
|
+
free(node_copies);
|
1954
|
+
free(node_init);
|
1955
|
+
lm_ggml_free(ctx_allocated);
|
1956
|
+
lm_ggml_free(ctx_unallocated);
|
1957
|
+
return {
|
1958
|
+
/* .buffer = */ NULL,
|
1959
|
+
/* .ctx_allocated = */ NULL,
|
1960
|
+
/* .ctx_unallocated = */ NULL,
|
1961
|
+
/* .graph = */ NULL,
|
1962
|
+
};
|
1998
1963
|
}
|
1999
1964
|
|
2000
|
-
//
|
2001
|
-
{
|
2002
|
-
int * tmp = sched->node_backend_ids;
|
2003
|
-
sched->node_backend_ids = sched->prev_node_backend_ids;
|
2004
|
-
sched->prev_node_backend_ids = tmp;
|
1965
|
+
//printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
|
2005
1966
|
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
1967
|
+
// copy data and init views
|
1968
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1969
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1970
|
+
graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
|
2009
1971
|
}
|
2010
1972
|
|
2011
|
-
|
2012
|
-
|
2013
|
-
|
2014
|
-
|
2015
|
-
|
2016
|
-
|
2017
|
-
LM_GGML_ASSERT(sched->graph.leafs != NULL);
|
1973
|
+
// build graph copy
|
1974
|
+
struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
|
1975
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1976
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1977
|
+
struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
|
1978
|
+
graph_copy->nodes[i] = node_copy;
|
2018
1979
|
}
|
2019
|
-
|
2020
|
-
sched->graph.n_leafs = 0;
|
1980
|
+
graph_copy->n_nodes = graph->n_nodes;
|
2021
1981
|
|
2022
|
-
|
1982
|
+
lm_ggml_hash_set_free(&hash_set);
|
1983
|
+
free(node_copies);
|
1984
|
+
free(node_init);
|
2023
1985
|
|
2024
|
-
|
2025
|
-
|
2026
|
-
|
1986
|
+
return {
|
1987
|
+
/* .buffer = */ buffer,
|
1988
|
+
/* .ctx_allocated = */ ctx_allocated,
|
1989
|
+
/* .ctx_unallocated = */ ctx_unallocated,
|
1990
|
+
/* .graph = */ graph_copy,
|
1991
|
+
};
|
1992
|
+
}
|
2027
1993
|
|
2028
|
-
|
2029
|
-
|
2030
|
-
|
1994
|
+
void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
|
1995
|
+
lm_ggml_backend_buffer_free(copy.buffer);
|
1996
|
+
lm_ggml_free(copy.ctx_allocated);
|
1997
|
+
lm_ggml_free(copy.ctx_unallocated);
|
1998
|
+
}
|
2031
1999
|
|
2032
|
-
|
2033
|
-
|
2034
|
-
|
2000
|
+
bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
|
2001
|
+
struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
|
2002
|
+
if (copy.buffer == NULL) {
|
2003
|
+
return false;
|
2004
|
+
}
|
2035
2005
|
|
2036
|
-
|
2037
|
-
|
2038
|
-
input_dep->src[0] = input;
|
2039
|
-
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
|
2040
|
-
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
2006
|
+
struct lm_ggml_cgraph * g1 = graph;
|
2007
|
+
struct lm_ggml_cgraph * g2 = copy.graph;
|
2041
2008
|
|
2042
|
-
|
2043
|
-
sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
|
2044
|
-
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
2045
|
-
}
|
2009
|
+
assert(g1->n_nodes == g2->n_nodes);
|
2046
2010
|
|
2047
|
-
|
2048
|
-
|
2049
|
-
|
2050
|
-
|
2051
|
-
}
|
2052
|
-
}
|
2011
|
+
for (int i = 0; i < g1->n_nodes; i++) {
|
2012
|
+
//printf("eval %d/%d\n", i, g1->n_nodes);
|
2013
|
+
struct lm_ggml_tensor * t1 = g1->nodes[i];
|
2014
|
+
struct lm_ggml_tensor * t2 = g2->nodes[i];
|
2053
2015
|
|
2054
|
-
|
2055
|
-
|
2056
|
-
|
2057
|
-
|
2058
|
-
|
2059
|
-
|
2060
|
-
|
2061
|
-
|
2062
|
-
|
2063
|
-
|
2064
|
-
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
2065
|
-
}
|
2016
|
+
assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
|
2017
|
+
|
2018
|
+
struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
|
2019
|
+
struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
|
2020
|
+
|
2021
|
+
lm_ggml_backend_graph_compute(backend1, &g1v);
|
2022
|
+
lm_ggml_backend_graph_compute(backend2, &g2v);
|
2023
|
+
|
2024
|
+
if (lm_ggml_is_view_op(t1->op)) {
|
2025
|
+
continue;
|
2066
2026
|
}
|
2067
2027
|
|
2068
|
-
|
2069
|
-
|
2070
|
-
|
2071
|
-
for (int j = 0; j < split->n_inputs; j++) {
|
2072
|
-
struct lm_ggml_tensor * input = split->inputs[j];
|
2073
|
-
size_t id = hash_id(input);
|
2074
|
-
for (int c = 0; c < sched->n_copies; c++) {
|
2075
|
-
struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
2076
|
-
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
2077
|
-
assert(graph_copy->size > graph_copy->n_leafs);
|
2078
|
-
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
2079
|
-
}
|
2080
|
-
}
|
2028
|
+
// compare results, calculate rms etc
|
2029
|
+
if (!callback(i, t1, t2, user_data)) {
|
2030
|
+
break;
|
2081
2031
|
}
|
2082
2032
|
}
|
2083
2033
|
|
2084
|
-
|
2085
|
-
|
2086
|
-
|
2087
|
-
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
2088
|
-
assert(graph_copy->size > graph_copy->n_leafs);
|
2089
|
-
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
2090
|
-
}
|
2034
|
+
lm_ggml_backend_graph_copy_free(copy);
|
2035
|
+
|
2036
|
+
return true;
|
2091
2037
|
}
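
A hypothetical callback for the comparison helper above: it downloads both results and reports the largest absolute difference. The (index, t1, t2, user_data) shape follows the call site above; lm_ggml_backend_tensor_get is assumed to be the public tensor-download helper, and the node outputs are assumed to be F32.

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    static bool cmp_cb(int node_idx, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data) {
        (void) user_data;
        const size_t n = (size_t) lm_ggml_nelements(t1);
        float * a = (float *) malloc(n * sizeof(float));
        float * b = (float *) malloc(n * sizeof(float));
        lm_ggml_backend_tensor_get(t1, a, 0, n * sizeof(float));
        lm_ggml_backend_tensor_get(t2, b, 0, n * sizeof(float));

        double max_err = 0.0;
        for (size_t i = 0; i < n; i++) {
            max_err = fmax(max_err, fabs((double) a[i] - (double) b[i]));
        }
        fprintf(stderr, "node %d (%s): max |diff| = %g\n", node_idx, t1->name, max_err);

        free(a);
        free(b);
        return true;   // keep comparing the remaining nodes
    }
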
|
2092
2038
|
|
2093
|
-
|
2094
|
-
|
2095
|
-
|
2096
|
-
|
2097
|
-
|
2098
|
-
|
2099
|
-
|
2100
|
-
|
2039
|
+
|
2040
|
+
|
2041
|
+
#include "ggml-backend.h"
|
2042
|
+
#include "ggml-backend-impl.h"
|
2043
|
+
#include "ggml-cpu.h"
|
2044
|
+
#include "ggml-impl.h"
|
2045
|
+
#include <cctype>
|
2046
|
+
#include <string>
|
2047
|
+
|
2048
|
+
// ggml-backend interface
|
2049
|
+
|
2050
|
+
// CPU backend - buffer
|
2051
|
+
|
2052
|
+
static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
|
2053
|
+
uintptr_t data = (uintptr_t)buffer->context;
|
2054
|
+
|
2055
|
+
// align the buffer
|
2056
|
+
if (data % TENSOR_ALIGNMENT != 0) {
|
2057
|
+
data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
|
2101
2058
|
}
|
2102
|
-
|
2103
|
-
|
2104
|
-
|
2105
|
-
|
2106
|
-
|
2107
|
-
|
2108
|
-
|
2109
|
-
|
2059
|
+
|
2060
|
+
return (void *)data;
|
2061
|
+
}
|
2062
|
+
|
2063
|
+
static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
|
2064
|
+
lm_ggml_aligned_free(buffer->context, buffer->size);
|
2065
|
+
}
|
2066
|
+
|
2067
|
+
static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
2068
|
+
memset((char *)tensor->data + offset, value, size);
|
2069
|
+
|
2070
|
+
LM_GGML_UNUSED(buffer);
|
2071
|
+
}
|
2072
|
+
|
2073
|
+
static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
2074
|
+
memcpy((char *)tensor->data + offset, data, size);
|
2075
|
+
|
2076
|
+
LM_GGML_UNUSED(buffer);
|
2077
|
+
}
|
2078
|
+
|
2079
|
+
static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
2080
|
+
memcpy(data, (const char *)tensor->data + offset, size);
|
2081
|
+
|
2082
|
+
LM_GGML_UNUSED(buffer);
|
2083
|
+
}
|
2084
|
+
|
2085
|
+
static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
|
2086
|
+
if (lm_ggml_backend_buffer_is_host(src->buffer)) {
|
2087
|
+
memcpy(dst->data, src->data, lm_ggml_nbytes(src));
|
2088
|
+
return true;
|
2110
2089
|
}
|
2090
|
+
return false;
|
2111
2091
|
|
2112
|
-
|
2113
|
-
|
2114
|
-
|
2115
|
-
|
2116
|
-
|
2117
|
-
|
2118
|
-
|
2119
|
-
|
2120
|
-
|
2121
|
-
|
2122
|
-
|
2123
|
-
|
2092
|
+
LM_GGML_UNUSED(buffer);
|
2093
|
+
}
|
2094
|
+
|
2095
|
+
static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
|
2096
|
+
memset(buffer->context, value, buffer->size);
|
2097
|
+
}
|
2098
|
+
|
2099
|
+
static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
|
2100
|
+
/* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
|
2101
|
+
/* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
|
2102
|
+
/* .init_tensor = */ NULL, // no initialization required
|
2103
|
+
/* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
|
2104
|
+
/* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
|
2105
|
+
/* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
|
2106
|
+
/* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
|
2107
|
+
/* .clear = */ lm_ggml_backend_cpu_buffer_clear,
|
2108
|
+
/* .reset = */ NULL,
|
2109
|
+
};
|
2110
|
+
|
2111
|
+
static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
|
2112
|
+
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
2113
|
+
/* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
|
2114
|
+
/* .init_tensor = */ NULL, // no initialization required
|
2115
|
+
/* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
|
2116
|
+
/* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
|
2117
|
+
/* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
|
2118
|
+
/* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
|
2119
|
+
/* .clear = */ lm_ggml_backend_cpu_buffer_clear,
|
2120
|
+
/* .reset = */ NULL,
|
2121
|
+
};
|
2122
|
+
|
2123
|
+
// CPU backend - buffer type
|
2124
|
+
|
2125
|
+
static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
|
2126
|
+
return "CPU";
|
2127
|
+
|
2128
|
+
LM_GGML_UNUSED(buft);
|
2129
|
+
}
|
2130
|
+
|
2131
|
+
static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
|
2132
|
+
void * data = lm_ggml_aligned_malloc(size);
|
2133
|
+
|
2134
|
+
if (data == NULL) {
|
2135
|
+
LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
|
2136
|
+
return NULL;
|
2124
2137
|
}
|
2125
2138
|
|
2126
|
-
return
|
2139
|
+
return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
|
2127
2140
|
}
|
2128
2141
|
|
2129
|
-
static
|
2130
|
-
|
2142
|
+
static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
|
2143
|
+
return TENSOR_ALIGNMENT;
|
2131
2144
|
|
2132
|
-
|
2133
|
-
|
2134
|
-
int split_backend_id = split->backend_id;
|
2135
|
-
lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
|
2145
|
+
LM_GGML_UNUSED(buft);
|
2146
|
+
}
|
2136
2147
|
|
2137
|
-
|
2138
|
-
|
2139
|
-
lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
2140
|
-
struct lm_ggml_tensor * input = split->inputs[j];
|
2141
|
-
struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
2148
|
+
static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
|
2149
|
+
return true;
|
2142
2150
|
|
2143
|
-
|
2144
|
-
|
2145
|
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
2146
|
-
lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
2147
|
-
} else {
|
2148
|
-
lm_ggml_backend_synchronize(split_backend);
|
2149
|
-
}
|
2150
|
-
lm_ggml_backend_tensor_copy(input, input_cpy);
|
2151
|
-
} else {
|
2152
|
-
// wait for the split backend to finish using the input before overwriting it
|
2153
|
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
2154
|
-
lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
2155
|
-
} else {
|
2156
|
-
lm_ggml_backend_synchronize(split_backend);
|
2157
|
-
}
|
2158
|
-
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
2159
|
-
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
2160
|
-
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
2161
|
-
lm_ggml_backend_synchronize(input_backend);
|
2162
|
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
2163
|
-
lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
2164
|
-
} else {
|
2165
|
-
lm_ggml_backend_synchronize(split_backend);
|
2166
|
-
}
|
2167
|
-
lm_ggml_backend_tensor_copy(input, input_cpy);
|
2168
|
-
}
|
2169
|
-
}
|
2170
|
-
}
|
2151
|
+
LM_GGML_UNUSED(buft);
|
2152
|
+
}
|
2171
2153
|
|
2172
|
-
|
2173
|
-
|
2174
|
-
|
2175
|
-
|
2176
|
-
|
2177
|
-
|
2178
|
-
//
|
2179
|
-
|
2180
|
-
|
2154
|
+
lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
|
2155
|
+
static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
|
2156
|
+
/* .iface = */ {
|
2157
|
+
/* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
|
2158
|
+
/* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
|
2159
|
+
/* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
|
2160
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
2161
|
+
/* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
|
2162
|
+
/* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
|
2163
|
+
},
|
2164
|
+
/* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
|
2165
|
+
/* .context = */ NULL,
|
2166
|
+
};
|
2181
2167
|
|
2182
|
-
|
2183
|
-
|
2168
|
+
return &lm_ggml_backend_cpu_buffer_type;
|
2169
|
+
}
|
2184
2170
|
|
2185
|
-
|
2171
|
+
static const char * lm_ggml_backend_cpu_buffer_from_ptr_type_get_name(lm_ggml_backend_buffer_type_t buft) {
|
2172
|
+
return "CPU_Mapped";
|
2186
2173
|
|
2187
|
-
|
2188
|
-
|
2189
|
-
t = split->graph.nodes[++j1];
|
2190
|
-
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
2191
|
-
}
|
2174
|
+
LM_GGML_UNUSED(buft);
|
2175
|
+
}
|
2192
2176
|
|
2193
|
-
|
2177
|
+
static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(void) {
|
2178
|
+
static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
|
2179
|
+
/* .iface = */ {
|
2180
|
+
/* .get_name = */ lm_ggml_backend_cpu_buffer_from_ptr_type_get_name,
|
2181
|
+
/* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
|
2182
|
+
/* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
|
2183
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
2184
|
+
/* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
|
2185
|
+
/* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
|
2186
|
+
},
|
2187
|
+
/* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
|
2188
|
+
/* .context = */ NULL,
|
2189
|
+
};
|
2194
2190
|
|
2195
|
-
|
2196
|
-
|
2197
|
-
return ec;
|
2198
|
-
}
|
2191
|
+
return &lm_ggml_backend_cpu_buffer_type;
|
2192
|
+
}
|
2199
2193
|
|
2200
|
-
|
2201
|
-
lm_ggml_backend_synchronize(split_backend);
|
2194
|
+
#ifdef LM_GGML_USE_CPU_HBM
|
2202
2195
|
|
2203
|
-
|
2204
|
-
break;
|
2205
|
-
}
|
2196
|
+
// buffer type HBM
|
2206
2197
|
|
2207
|
-
|
2208
|
-
}
|
2209
|
-
}
|
2198
|
+
#include <hbwmalloc.h>
|
2210
2199
|
|
2211
|
-
|
2212
|
-
|
2213
|
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
2214
|
-
lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
|
2215
|
-
}
|
2216
|
-
}
|
2217
|
-
}
|
2200
|
+
static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
|
2201
|
+
return "CPU_HBM";
|
2218
2202
|
|
2219
|
-
|
2203
|
+
LM_GGML_UNUSED(buft);
|
2204
|
+
}
|
2220
2205
|
|
2221
|
-
|
2206
|
+
static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
|
2207
|
+
hbw_free(buffer->context);
|
2222
2208
|
}
|
2223
2209
|
|
2224
|
-
|
2225
|
-
|
2226
|
-
|
2227
|
-
|
2228
|
-
|
2229
|
-
|
2230
|
-
|
2231
|
-
LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
|
2232
|
-
LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
2210
|
+
static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
|
2211
|
+
void * ptr;
|
2212
|
+
int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
2213
|
+
if (result != 0) {
|
2214
|
+
LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
|
2215
|
+
return NULL;
|
2216
|
+
}
|
2233
2217
|
|
2234
|
-
|
2218
|
+
lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
2219
|
+
buffer->buft = buft;
|
2220
|
+
buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
|
2235
2221
|
|
2236
|
-
|
2237
|
-
|
2238
|
-
sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
|
2222
|
+
return buffer;
|
2223
|
+
}
|
2239
2224
|
|
2240
|
-
|
2241
|
-
|
2242
|
-
|
2243
|
-
|
2244
|
-
|
2225
|
+
lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
|
2226
|
+
static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
|
2227
|
+
/* .iface = */ {
|
2228
|
+
/* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
|
2229
|
+
/* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
2230
|
+
/* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
|
2231
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
2232
|
+
/* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
|
2233
|
+
/* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
|
2234
|
+
},
|
2235
|
+
/* .context = */ NULL,
|
2236
|
+
};
|
2245
2237
|
|
2246
|
-
|
2247
|
-
|
2248
|
-
|
2249
|
-
sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
2250
|
-
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
2251
|
-
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
2238
|
+
return &lm_ggml_backend_cpu_buffer_type_hbm;
|
2239
|
+
}
|
2240
|
+
#endif
|
2252
2241
|
|
2253
|
-
|
2254
|
-
|
2242
|
+
static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
|
2243
|
+
static lm_ggml_backend_buffer_type_t bufts[] = {
|
2244
|
+
#ifdef LM_GGML_USE_CPU_HBM
|
2245
|
+
lm_ggml_backend_cpu_hbm_buffer_type(),
|
2246
|
+
#endif
|
2247
|
+
NULL
|
2248
|
+
};
|
2255
2249
|
|
2256
|
-
|
2257
|
-
sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
2258
|
-
sched->splits_capacity = initial_splits_capacity;
|
2250
|
+
return bufts;
|
2259
2251
|
|
2260
|
-
|
2261
|
-
|
2262
|
-
sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
|
2263
|
-
LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
2252
|
+
LM_GGML_UNUSED(device);
|
2253
|
+
}
|
2264
2254
|
|
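For orientation only, not part of the package: a minimal sketch of how the CPU buffer types added above can be exercised through the generic buffer-type API declared in package/cpp/ggml-backend.h. The helper name cpu_buffer_type_demo and the 1024-byte size are illustrative.

#include <cstdio>
#include "ggml-backend.h" // lm_-prefixed backend API shipped in package/cpp

static void cpu_buffer_type_demo(void) {
    // the buffer type defined by lm_ggml_backend_cpu_buffer_type() above
    lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();

    // .get_name / .is_host resolve to the CPU callbacks wired up in this hunk
    printf("buffer type: %s, host: %d\n",
           lm_ggml_backend_buft_name(buft),
           lm_ggml_backend_buft_is_host(buft));

    // .alloc_buffer resolves to lm_ggml_backend_cpu_buffer_type_alloc_buffer
    lm_ggml_backend_buffer_t buf = lm_ggml_backend_buft_alloc_buffer(buft, 1024);
    if (buf != NULL) {
        printf("allocated %zu bytes\n", lm_ggml_backend_buffer_get_size(buf));
        lm_ggml_backend_buffer_free(buf);
    }
}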
2265      | -
2266      | - for (int c = 0; c < sched->n_copies; c++) {
2267      | - sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
2268      | - }
2269      | - }
2270      | - }
     2255 | + // CPU backend - backend (stream)
2271 2256 |
2272      | -
     2257 | + struct lm_ggml_backend_cpu_context {
     2258 | + int n_threads;
     2259 | + lm_ggml_threadpool_t threadpool;
2273 2260 |
2274      | -
     2261 | + uint8_t * work_data;
     2262 | + size_t work_size;
2275 2263 |
2276      | -
2277      | -
     2264 | + lm_ggml_abort_callback abort_callback;
     2265 | + void * abort_callback_data;
     2266 | + };
2278 2267 |
2279      | -
2280      | -
2281      | -
2282      | -
2283      | - for (int b = 0; b < sched->n_backends; b++) {
2284      | - for (int c = 0; c < sched->n_copies; c++) {
2285      | - lm_ggml_backend_event_free(sched->events[b][c]);
2286      | - }
2287      | - }
2288      | - lm_ggml_gallocr_free(sched->galloc);
2289      | - lm_ggml_free(sched->ctx);
2290      | - lm_ggml_hash_set_free(&sched->hash_set);
2291      | - free(sched->splits);
2292      | - free(sched->hv_tensor_backend_ids);
2293      | - free(sched->hv_tensor_copies);
2294      | - free(sched->node_backend_ids);
2295      | - free(sched->leaf_backend_ids);
2296      | - free(sched->prev_node_backend_ids);
2297      | - free(sched->prev_leaf_backend_ids);
2298      | - free(sched->context_buffer);
2299      | - free(sched->graph.nodes);
2300      | - free(sched->graph.leafs);
2301      | - free(sched);
     2268 | + static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
     2269 | + return "CPU";
     2270 | +
     2271 | + LM_GGML_UNUSED(backend);
2302 2272 |   }
2303 2273 |
2304      | - void
2305      | -
2306      | -
2307      | -
2308      | -
2309      | - memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
2310      | - sched->is_reset = true;
2311      | - }
2312      | - sched->is_alloc = false;
     2274 | + static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
     2275 | + struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
     2276 | + delete[] cpu_ctx->work_data;
     2277 | + delete cpu_ctx;
     2278 | + delete backend;
2313 2279 |   }
2314 2280 |
2315      | -
2316      | -
     2281 | + struct lm_ggml_backend_plan_cpu {
     2282 | + struct lm_ggml_cplan cplan;
     2283 | + struct lm_ggml_cgraph cgraph;
     2284 | + };
     2285 | +
     2286 | + static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
     2287 | + struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2317 2288 |
2318      | -
     2289 | + struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
2319 2290 |
2320      | -
2321      | -
     2291 | + cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     2292 | + cpu_plan->cgraph = *cgraph; // FIXME: deep copy
     2293 | +
     2294 | + if (cpu_plan->cplan.work_size > 0) {
     2295 | + cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
     2296 | + if (cpu_plan->cplan.work_data == NULL) {
     2297 | + delete cpu_plan;
     2298 | + return NULL;
     2299 | + }
2322 2300 |   }
2323 2301 |
2324      | -
2325      | -
     2302 | + cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
     2303 | + cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2326 2304 |
2327      | - return
     2305 | + return cpu_plan;
2328 2306 |   }
2329 2307 |
2330      | -
2331      | -
     2308 | + static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
     2309 | + struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2332 2310 |
2333      | -
     2311 | + delete[] cpu_plan->cplan.work_data;
     2312 | + delete cpu_plan;
2334 2313 |
     2314 | + LM_GGML_UNUSED(backend);
     2315 | + }
2335 2316 |
2336      | -
2337      | -
2338      | - }
     2317 | + static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
     2318 | + struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2339 2319 |
2340      | -
     2320 | + return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
2341 2321 |
2342      | -
     2322 | + LM_GGML_UNUSED(backend);
2343 2323 |   }
2344 2324 |
2345      | - enum lm_ggml_status
2346      | -
2347      | - lm_ggml_backend_sched_synchronize(sched);
2348      | - return err;
2349      | - }
     2325 | + static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
     2326 | + struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2350 2327 |
2351      | -
2352      | - if (!sched->is_reset && !sched->is_alloc) {
2353      | - lm_ggml_backend_sched_reset(sched);
2354      | - }
     2328 | + struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2355 2329 |
2356      | - if (
2357      | -
     2330 | + if (cpu_ctx->work_size < cplan.work_size) {
     2331 | + delete[] cpu_ctx->work_data;
     2332 | + cpu_ctx->work_data = new uint8_t[cplan.work_size];
     2333 | + if (cpu_ctx->work_data == NULL) {
     2334 | + cpu_ctx->work_size = 0;
2358 2335 |   return LM_GGML_STATUS_ALLOC_FAILED;
2359 2336 |   }
     2337 | + cpu_ctx->work_size = cplan.work_size;
2360 2338 |   }
     2339 | + cplan.work_data = (uint8_t *)cpu_ctx->work_data;
2361 2340 |
2362      | -
2363      | -
     2341 | + cplan.abort_callback = cpu_ctx->abort_callback;
     2342 | + cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2364 2343 |
2365      | -
2366      | - for (int i = 0; i < sched->n_backends; i++) {
2367      | - lm_ggml_backend_synchronize(sched->backends[i]);
2368      | - }
     2344 | + return lm_ggml_graph_compute(cgraph, &cplan);
2369 2345 |   }
2370 2346 |
2371      | -
2372      | -
2373      | -
2374      | -
     2347 | + static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
     2348 | + /* .get_name = */ lm_ggml_backend_cpu_get_name,
     2349 | + /* .free = */ lm_ggml_backend_cpu_free,
     2350 | + /* .set_tensor_async = */ NULL,
     2351 | + /* .get_tensor_async = */ NULL,
     2352 | + /* .cpy_tensor_async = */ NULL,
     2353 | + /* .synchronize = */ NULL,
     2354 | + /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
     2355 | + /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
     2356 | + /* .graph_plan_update = */ NULL,
     2357 | + /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
     2358 | + /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
     2359 | + /* .event_record = */ NULL,
     2360 | + /* .event_wait = */ NULL,
     2361 | + };
2375 2362 |
2376      | -
2377      | -
     2363 | + static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
     2364 | + static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
     2365 | + return &guid;
2378 2366 |   }
2379 2367 |
2380      | -
2381      | -
2382      | -
     2368 | + lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
     2369 | + // initialize CPU backend now to avoid slowing the first graph computation
     2370 | + lm_ggml_cpu_init();
2383 2371 |
2384      | -
2385      | -
2386      | -
     2372 | + struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
     2373 | + if (ctx == NULL) {
     2374 | + return NULL;
     2375 | + }
2387 2376 |
2388      | -
2389      | -
2390      | -
2391      | -
     2377 | + ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
     2378 | + ctx->threadpool = NULL;
     2379 | + ctx->work_data = NULL;
     2380 | + ctx->work_size = 0;
     2381 | + ctx->abort_callback = NULL;
     2382 | + ctx->abort_callback_data = NULL;
2392 2383 |
2393      | -
2394      | -
2395      | -
     2384 | + lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
     2385 | + /* .guid = */ lm_ggml_backend_cpu_guid(),
     2386 | + /* .interface = */ lm_ggml_backend_cpu_i,
     2387 | + /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
     2388 | + /* .context = */ ctx,
     2389 | + };
2396 2390 |
2397      | -
     2391 | + if (cpu_backend == NULL) {
     2392 | + delete ctx;
     2393 | + return NULL;
     2394 | + }
     2395 | +
     2396 | + return cpu_backend;
2398 2397 |   }
2399 2398 |
2400      | -
2401      | -
2402      | - LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2403      | - tensor_backend_id(node) = backend_index;
2404      | - SET_CAUSE(node, "usr");
2405      | - sched->is_reset = false;
     2399 | + bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
     2400 | + return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
2406 2401 |   }
2407 2402 |
2408      | -
2409      | -
2410      | -
2411      | -
2412      | -
2413      | - return sched->backends[backend_index];
     2403 | + void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
     2404 | + LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
     2405 | +
     2406 | + struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
     2407 | + ctx->n_threads = n_threads;
2414 2408 |   }
2415 2409 |
2416      | -
     2410 | + void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
     2411 | + LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2417 2412 |
2418      | -
2419      | - LM_GGML_ASSERT(tensor->buffer == NULL);
2420      | - LM_GGML_ASSERT(tensor->view_src != NULL);
2421      | - LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
2422      | - LM_GGML_ASSERT(tensor->view_src->data != NULL);
     2413 | + struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2423 2414 |
2424      | -
2425      | -
2426      | -
     2415 | + if (ctx->threadpool && ctx->threadpool != threadpool) {
     2416 | + // already had a different threadpool, pause/suspend it before switching
     2417 | + lm_ggml_threadpool_pause(ctx->threadpool);
     2418 | + }
     2419 | + ctx->threadpool = threadpool;
2427 2420 |   }
2428 2421 |
2429      | - void
2430      | - LM_GGML_ASSERT(
2431      | - LM_GGML_ASSERT(tensor->data == NULL);
2432      | - LM_GGML_ASSERT(tensor->view_src == NULL);
2433      | - LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
2434      | - LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2435      | - (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
     2422 | + void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
     2423 | + LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2436 2424 |
2437      | -
2438      | -
2439      | -
     2425 | + struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
     2426 | + ctx->abort_callback = abort_callback;
     2427 | + ctx->abort_callback_data = abort_callback_data;
2440 2428 |   }
2441 2429 |
2442      | -
2443      | -
2444      | -
2445      | -
2446      | - LM_GGML_ASSERT(src->data && "graph must be allocated");
     2430 | + lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     2431 | + LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     2432 | + return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
     2433 | + }
2447 2434 |
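For orientation only, not part of the package: a minimal sketch of creating and configuring the CPU backend defined above, using the public setters added in this hunk. The helper name make_cpu_backend and the thread count are illustrative.

#include <cassert>
#include "ggml-backend.h"

static lm_ggml_backend_t make_cpu_backend(int n_threads) {
    lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
    assert(backend != NULL && lm_ggml_backend_is_cpu(backend));

    // both setters write into the lm_ggml_backend_cpu_context introduced above
    lm_ggml_backend_cpu_set_n_threads(backend, n_threads);
    lm_ggml_backend_cpu_set_abort_callback(backend, /*abort_callback=*/NULL, /*abort_callback_data=*/NULL);

    return backend;
}
// a caller would later run lm_ggml_backend_graph_compute(backend, graph)
// and release the backend with lm_ggml_backend_free(backend)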
2448      | -
2449      | - if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
2450      | - return node_copies[lm_ggml_hash_find(&hash_set, src)];
2451      | - }
     2435 | + // CPU backend - device
2452 2436 |
2453      | -
2454      | -
2455      | - dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2456      | - dst->view_offs = src->view_offs;
2457      | - }
2458      | - dst->op = src->op;
2459      | - memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2460      | - lm_ggml_set_name(dst, src->name);
     2437 | + struct lm_ggml_backend_cpu_device_context {
     2438 | + std::string description = "CPU";
2461 2439 |
2462      | -
2463      | -
2464      | -
2465      | - if (
2466      | -
     2440 | + lm_ggml_backend_cpu_device_context() {
     2441 | + #ifdef __APPLE__
     2442 | + size_t len = 0;
     2443 | + if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
     2444 | + description.resize(len);
     2445 | + sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
     2446 | + }
     2447 | + #elif defined(__linux__)
     2448 | + FILE * f = fopen("/proc/cpuinfo", "r");
     2449 | + if (f) {
     2450 | + char buf[1024];
     2451 | + while (fgets(buf, sizeof(buf), f)) {
     2452 | + if (strncmp(buf, "model name", 10) == 0) {
     2453 | + char * p = strchr(buf, ':');
     2454 | + if (p) {
     2455 | + p++;
     2456 | + while (std::isspace(*p)) {
     2457 | + p++;
     2458 | + }
     2459 | + while (std::isspace(p[strlen(p) - 1])) {
     2460 | + p[strlen(p) - 1] = '\0';
     2461 | + }
     2462 | + description = p;
     2463 | + break;
     2464 | + }
     2465 | + }
     2466 | + }
     2467 | + fclose(f);
     2468 | + }
     2469 | + #elif defined(_WIN32)
     2470 | + HKEY hKey;
     2471 | + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
     2472 | + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
     2473 | + 0,
     2474 | + KEY_READ,
     2475 | + &hKey) == ERROR_SUCCESS) {
     2476 | + DWORD cpu_brand_size = 0;
     2477 | + if (RegQueryValueExA(hKey,
     2478 | + TEXT("ProcessorNameString"),
     2479 | + NULL,
     2480 | + NULL,
     2481 | + NULL,
     2482 | + &cpu_brand_size) == ERROR_SUCCESS) {
     2483 | + description.resize(cpu_brand_size);
     2484 | + if (RegQueryValueExA(hKey,
     2485 | + TEXT("ProcessorNameString"),
     2486 | + NULL,
     2487 | + NULL,
     2488 | + (LPBYTE)&description[0], // NOLINT
     2489 | + &cpu_brand_size) == ERROR_SUCCESS) {
     2490 | + if (description.find('\0') != std::string::npos) {
     2491 | + description.resize(description.find('\0'));
     2492 | + }
     2493 | + }
     2494 | + }
     2495 | + RegCloseKey(hKey);
2467 2496 |   }
2468      | -
     2497 | + #endif
2469 2498 |   }
     2499 | + };
2470 2500 |
2471      | -
2472      | - return
     2501 | + static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
     2502 | + return "CPU";
     2503 | +
     2504 | + LM_GGML_UNUSED(dev);
2473 2505 |   }
2474 2506 |
2475      | - static
2476      | -
2477      | - if (node_init[id]) {
2478      | - return;
2479      | - }
2480      | - node_init[id] = true;
     2507 | + static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
     2508 | + struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
2481 2509 |
2482      | -
2483      | -
2484      | - graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2485      | - lm_ggml_backend_view_init(dst);
2486      | - }
2487      | - else {
2488      | - lm_ggml_backend_tensor_copy(src, dst);
2489      | - }
     2510 | + return ctx->description.c_str();
     2511 | + }
2490 2512 |
2491      | -
2492      | -
2493      | -
2494      | -
2495      | -
2496      | -
2497      | - graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2498      | - }
     2513 | + static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
     2514 | + // TODO
     2515 | + *free = 0;
     2516 | + *total = 0;
     2517 | +
     2518 | + LM_GGML_UNUSED(dev);
2499 2519 |   }
2500 2520 |
2501      | -
2502      | -
2503      | - struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2504      | - bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
     2521 | + static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
     2522 | + return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
2505 2523 |
2506      | -
2507      | -
2508      | -
2509      | -
     2524 | + LM_GGML_UNUSED(dev);
     2525 | + }
     2526 | +
     2527 | + static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
     2528 | + props->name = lm_ggml_backend_cpu_device_get_name(dev);
     2529 | + props->description = lm_ggml_backend_cpu_device_get_description(dev);
     2530 | + props->type = lm_ggml_backend_cpu_device_get_type(dev);
     2531 | + lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
     2532 | + props->caps = {
     2533 | + /* .async = */ false,
     2534 | + /* .host_buffer = */ false,
     2535 | + /* .buffer_from_host_ptr = */ true,
     2536 | + /* .events = */ false,
2510 2537 |   };
     2538 | + }
2511 2539 |
2512      | -
2513      | -
     2540 | + static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
     2541 | + return lm_ggml_backend_cpu_init();
2514 2542 |
2515      | -
2516      | -
2517      | -
2518      | - free(node_copies);
2519      | - free(node_init);
2520      | - lm_ggml_free(ctx_allocated);
2521      | - lm_ggml_free(ctx_unallocated);
2522      | - return {
2523      | - /* .buffer = */ NULL,
2524      | - /* .ctx_allocated = */ NULL,
2525      | - /* .ctx_unallocated = */ NULL,
2526      | - /* .graph = */ NULL,
2527      | - };
2528      | - }
     2543 | + LM_GGML_UNUSED(dev);
     2544 | + LM_GGML_UNUSED(params);
     2545 | + }
2529 2546 |
2530      | -
2531      | -
2532      | - struct lm_ggml_tensor * node = graph->nodes[i];
2533      | - graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2534      | - }
     2547 | + static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
     2548 | + return lm_ggml_backend_cpu_buffer_type();
2535 2549 |
2536      | -
2537      | -
2538      | - if (buffer == NULL) {
2539      | - LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
2540      | - lm_ggml_hash_set_free(&hash_set);
2541      | - free(node_copies);
2542      | - free(node_init);
2543      | - lm_ggml_free(ctx_allocated);
2544      | - lm_ggml_free(ctx_unallocated);
2545      | - return {
2546      | - /* .buffer = */ NULL,
2547      | - /* .ctx_allocated = */ NULL,
2548      | - /* .ctx_unallocated = */ NULL,
2549      | - /* .graph = */ NULL,
2550      | - };
2551      | - }
     2550 | + LM_GGML_UNUSED(dev);
     2551 | + }
2552 2552 |
2553      | -
     2553 | + static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     2554 | + return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2554 2555 |
2555      | -
2556      | -
2557      | -
2558      | - graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2559      | - }
     2556 | + LM_GGML_UNUSED(dev);
     2557 | + LM_GGML_UNUSED(max_tensor_size);
     2558 | + }
2560 2559 |
2561      | -
2562      | -
2563      | -
2564      | -
2565      | -
2566      | -
     2560 | + static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
     2561 | + switch (op->op) {
     2562 | + case LM_GGML_OP_CPY:
     2563 | + return
     2564 | + op->type != LM_GGML_TYPE_IQ2_XXS &&
     2565 | + op->type != LM_GGML_TYPE_IQ2_XS &&
     2566 | + op->type != LM_GGML_TYPE_IQ1_S &&
     2567 | + op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
     2568 | + case LM_GGML_OP_MUL_MAT:
     2569 | + return op->src[1]->type == LM_GGML_TYPE_F32;// FIXME || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
     2570 | + case LM_GGML_OP_ROPE_BACK:
     2571 | + return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
     2572 | + case LM_GGML_OP_IM2COL_BACK:
     2573 | + return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
     2574 | + case LM_GGML_OP_OUT_PROD:
     2575 | + return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
     2576 | + default:
     2577 | + return true;
2567 2578 |   }
2568      | - graph_copy->n_nodes = graph->n_nodes;
2569      | -
2570      | - lm_ggml_hash_set_free(&hash_set);
2571      | - free(node_copies);
2572      | - free(node_init);
2573 2579 |
2574      | -
2575      | - /* .buffer = */ buffer,
2576      | - /* .ctx_allocated = */ ctx_allocated,
2577      | - /* .ctx_unallocated = */ ctx_unallocated,
2578      | - /* .graph = */ graph_copy,
2579      | - };
     2580 | + LM_GGML_UNUSED(dev);
2580 2581 |   }
2581 2582 |
2582      | -
2583      | -
2584      | -
2585      | -
     2583 | + static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
     2584 | + return lm_ggml_backend_buft_is_host(buft);
     2585 | +
     2586 | + LM_GGML_UNUSED(dev);
2586 2587 |   }
2587 2588 |
2588      | -
2589      | -
2590      | -
2591      | -
2592      | -
     2589 | + static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
     2590 | + /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
     2591 | + /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
     2592 | + /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
     2593 | + /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
     2594 | + /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
     2595 | + /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
     2596 | + /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
     2597 | + /* .get_host_buffer_type = */ NULL,
     2598 | + /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
     2599 | + /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
     2600 | + /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
     2601 | + /* .offload_op = */ NULL,
     2602 | + /* .event_new = */ NULL,
     2603 | + /* .event_free = */ NULL,
     2604 | + /* .event_synchronize = */ NULL,
     2605 | + };
2593 2606 |
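For orientation only, not part of the package: a sketch of querying the CPU device registered above through the generic device API. It assumes lm_ggml_backend_dev_get_props is the lm_-prefixed equivalent of ggml_backend_dev_get_props declared in ggml-backend.h; the helper name is illustrative.

#include <cstdio>
#include "ggml-backend.h"

static void print_cpu_device_info(void) {
    // index 0 is the only device exposed by the CPU registry (see the reg code below)
    lm_ggml_backend_dev_t dev = lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0);

    struct lm_ggml_backend_dev_props props;
    lm_ggml_backend_dev_get_props(dev, &props); // filled by lm_ggml_backend_cpu_device_get_props

    printf("device: %s (%s), buffer_from_host_ptr: %d\n",
           props.name, props.description, props.caps.buffer_from_host_ptr);
}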
2594      | -
2595      | - struct lm_ggml_cgraph * g2 = copy.graph;
     2607 | + // CPU backend - backend (reg)
2596 2608 |
2597      | -
     2609 | + static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
     2610 | + return "CPU";
2598 2611 |
2599      | -
2600      | -
2601      | - struct lm_ggml_tensor * t1 = g1->nodes[i];
2602      | - struct lm_ggml_tensor * t2 = g2->nodes[i];
     2612 | + LM_GGML_UNUSED(reg);
     2613 | + }
2603 2614 |
2604      | -
     2615 | + static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
     2616 | + return 1;
2605 2617 |
2606      | -
2607      | -
     2618 | + LM_GGML_UNUSED(reg);
     2619 | + }
2608 2620 |
2609      | -
2610      | -
     2621 | + static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
     2622 | + LM_GGML_ASSERT(index == 0);
2611 2623 |
2612      | -
2613      | -
2614      | -
     2624 | + static lm_ggml_backend_cpu_device_context ctx;
     2625 | + static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
     2626 | + /* .iface = */ lm_ggml_backend_cpu_device_i,
     2627 | + /* .reg = */ reg,
     2628 | + /* .context = */ &ctx,
     2629 | + };
2615 2630 |
2616      | -
2617      | -
2618      | -
2619      | -
     2631 | + return &lm_ggml_backend_cpu_device;
     2632 | + }
     2633 | +
     2634 | + static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
     2635 | + if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
     2636 | + return (void *)lm_ggml_backend_cpu_set_n_threads;
     2637 | + }
     2638 | + if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
     2639 | + return (void *)lm_ggml_backend_cpu_get_extra_bufts;
2620 2640 |   }
2621 2641 |
2622      | -
     2642 | + return NULL;
2623 2643 |
2624      | -
     2644 | + LM_GGML_UNUSED(reg);
     2645 | + }
     2646 | +
     2647 | + static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
     2648 | + /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
     2649 | + /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
     2650 | + /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
     2651 | + /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
     2652 | + };
     2653 | +
     2654 | + lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
     2655 | + static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
     2656 | + /* .iface = */ lm_ggml_backend_cpu_reg_i,
     2657 | + /* .context = */ NULL,
     2658 | + };
     2659 | +
     2660 | + return &lm_ggml_backend_cpu_reg;
2625 2661 |   }
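For orientation only, not part of the package: a sketch of resolving the optional set_n_threads entry point through the registry added above, the way a backend-agnostic caller would. It assumes lm_ggml_backend_reg_get_proc_address is the lm_-prefixed equivalent of ggml_backend_reg_get_proc_address; the local typedef and helper name exist only for this sketch.

#include <cstring>
#include "ggml-backend.h"

// local function-pointer type for the entry point returned by get_proc_address
typedef void (*set_n_threads_fn_t)(lm_ggml_backend_t backend, int n_threads);

static void set_threads_via_registry(lm_ggml_backend_t backend, int n_threads) {
    lm_ggml_backend_reg_t reg = lm_ggml_backend_cpu_reg();

    // matches the "lm_ggml_backend_set_n_threads" string handled in lm_ggml_backend_cpu_get_proc_address
    set_n_threads_fn_t fn = (set_n_threads_fn_t)
        lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_n_threads");

    if (fn != NULL) {
        fn(backend, n_threads); // dispatches to lm_ggml_backend_cpu_set_n_threads
    }
}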