cui-llama.rn 1.2.3 → 1.2.6
This diff compares the publicly released 1.2.3 and 1.2.6 versions of the package as published to the supported registries; it is provided for informational purposes only.
- package/README.md +0 -2
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +0 -3
- package/android/src/main/jni.cpp +9 -11
- package/cpp/common.cpp +85 -75
- package/cpp/common.h +127 -91
- package/cpp/ggml-aarch64.c +269 -0
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend-impl.h +4 -15
- package/cpp/ggml-backend.cpp +1697 -1626
- package/cpp/ggml-backend.h +13 -25
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu.c +13720 -0
- package/cpp/ggml-cpu.h +150 -0
- package/cpp/ggml-impl.h +95 -0
- package/cpp/ggml-metal.m +185 -71
- package/cpp/ggml-quants.c +38 -51
- package/cpp/ggml.c +4468 -19500
- package/cpp/ggml.h +26 -146
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +742 -249
- package/cpp/llama-sampling.h +21 -2
- package/cpp/llama-vocab.cpp +49 -9
- package/cpp/llama-vocab.h +35 -11
- package/cpp/llama.cpp +2468 -2307
- package/cpp/llama.h +65 -32
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +117 -118
- package/cpp/sampling.h +20 -20
- package/cpp/sgemm.cpp +57 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +0 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +0 -1
package/cpp/ggml-backend.cpp
CHANGED
Note: the diff view truncates the content of several removed (`-`) lines; they are reproduced below exactly as captured.

```diff
@@ -8,6 +8,7 @@
 #include <windows.h>
 #endif
 
+#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
@@ -34,6 +35,11 @@ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
 }
 
 lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return lm_ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -89,7 +95,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
 }
 
 const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
-    return
+    return lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(buffer));
 }
 
 void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
@@ -108,6 +114,11 @@ size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
 }
 
 void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +133,15 @@ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct
     }
 }
 
+void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
+    // clear is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return;
+    }
+
+    buffer->iface.clear(buffer, value);
+}
+
 size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
     return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
 }
@@ -134,10 +154,6 @@ size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, st
     return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
 }
 
-void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
 bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
     return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
 }
```
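Taken together, the hunks above make zero-sized buffers a first-class case: allocation returns a dummy buffer instead of calling into the backend, and `get_base`/`clear` short-circuit on it. A minimal sketch of the resulting behaviour (not part of the package; it assumes the renamed `lm_`-prefixed headers are on the include path and that `lm_ggml_backend_cpu_buffer_type()` is still exported by them):

```cpp
// Sketch only: illustrates the new zero-size path added in 1.2.6's ggml-backend.cpp.
#include "ggml-backend.h"
#include <cassert>

int main() {
    // Hypothetical choice of buffer type; any lm_ggml_backend_buffer_type_t behaves the same here.
    lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();

    // size == 0 no longer reaches buft->iface.alloc_buffer; a dummy buffer is returned instead.
    lm_ggml_backend_buffer_t buf = lm_ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0);

    assert(lm_ggml_backend_buffer_get_size(buf) == 0);
    assert(lm_ggml_backend_buffer_get_base(buf) == NULL); // get_base is optional for zero-sized buffers
    lm_ggml_backend_buffer_clear(buf, 0);                 // clear is a no-op for zero-sized buffers

    lm_ggml_backend_buffer_free(buf);
    return 0;
}
```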
```diff
@@ -198,7 +214,7 @@ void lm_ggml_backend_free(lm_ggml_backend_t backend) {
 }
 
 lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
-    return backend->
+    return lm_ggml_backend_dev_buffer_type(backend->device);
 }
 
 lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
@@ -238,43 +254,42 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
 void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
     LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
     LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-
-    LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
+    if (size == 0) {
         return;
     }
 
-    LM_GGML_ASSERT(buf
+    LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
+    LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
@@ -316,33 +331,15 @@ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backen
 }
 
 bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-
-    if (backend->device) {
-        return lm_ggml_backend_dev_supports_op(backend->device, op);
-    }
-
-    return backend->iface.supports_op(backend, op);
+    return lm_ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
-
-    if (backend->device) {
-        return lm_ggml_backend_dev_supports_buft(backend->device, buft);
-    }
-
-    return backend->iface.supports_buft(backend, buft);
+    return lm_ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-
-    if (backend->device) {
-        return lm_ggml_backend_dev_offload_op(backend->device, op);
-    }
-
-    if (backend->iface.offload_op != NULL) {
-        return backend->iface.offload_op(backend, op);
-    }
-    return false;
+    return lm_ggml_backend_dev_offload_op(backend->device, op);
 }
 
 lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
@@ -379,7 +376,7 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten
         lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
     } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-
+        LM_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = lm_ggml_nbytes(src);
         void * data = malloc(nbytes);
@@ -538,10 +535,40 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
 #include "ggml-metal.h"
 #endif
 
+#ifdef LM_GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef LM_GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #ifdef LM_GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
 
+#ifdef LM_GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
+#ifndef __AMX_INT8__
+#undef LM_GGML_USE_AMX
+#endif
+
+#ifdef LM_GGML_USE_AMX
+# include "ggml-amx.h"
+#endif
+
+#ifdef LM_GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef LM_GGML_USE_KOMPUTE
+#include "ggml-kompute.h"
+#endif
+
+#include "ggml-cpu.h"
+
 struct lm_ggml_backend_registry {
     std::vector<lm_ggml_backend_reg_t> backends;
     std::vector<lm_ggml_backend_dev_t> devices;
@@ -553,18 +580,34 @@ struct lm_ggml_backend_registry {
 #ifdef LM_GGML_USE_METAL
         register_backend(lm_ggml_backend_metal_reg());
 #endif
+#ifdef LM_GGML_USE_SYCL
+        register_backend(lm_ggml_backend_sycl_reg());
+#endif
+#ifdef LM_GGML_USE_VULKAN
+        register_backend(lm_ggml_backend_vk_reg());
+#endif
+#ifdef LM_GGML_USE_CANN
+        register_backend(lm_ggml_backend_cann_reg());
+#endif
 #ifdef LM_GGML_USE_BLAS
         register_backend(lm_ggml_backend_blas_reg());
 #endif
-
-
+#ifdef LM_GGML_USE_RPC
+        register_backend(lm_ggml_backend_rpc_reg());
+#endif
+#ifdef LM_GGML_USE_AMX
+        register_backend(lm_ggml_backend_amx_reg());
+#endif
+#ifdef LM_GGML_USE_KOMPUTE
+        register_backend(lm_ggml_backend_kompute_reg());
+#endif
 
         register_backend(lm_ggml_backend_cpu_reg());
     }
 
     void register_backend(lm_ggml_backend_reg_t reg) {
 #ifndef NDEBUG
-
+        LM_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
 #endif
         backends.push_back(reg);
```
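Which of these `register_backend()` calls are compiled in is decided by the `LM_GGML_USE_*` defines at build time (the Android CMakeLists.txt is also touched in this release). A hedged sketch of inspecting the result at runtime; it assumes `lm_ggml_backend_dev_count()` and `lm_ggml_backend_dev_get()` are exposed by this version's ggml-backend.h, mirroring upstream ggml's device registry:

```cpp
// Illustrative only: enumerate whatever devices the registry above ended up registering.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
        // dev_name/dev_description are the same accessors the registry's debug log uses.
        printf("device %zu: %s (%s)\n", i,
               lm_ggml_backend_dev_name(dev), lm_ggml_backend_dev_description(dev));
    }
    return 0;
}
```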
```diff
@@ -575,7 +618,7 @@ struct lm_ggml_backend_registry {
 
     void register_device(lm_ggml_backend_dev_t device) {
 #ifndef NDEBUG
-
+        LM_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
 #endif
         devices.push_back(device);
     }
@@ -663,9 +706,9 @@ lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type typ
 }
 
 lm_ggml_backend_t lm_ggml_backend_init_best(void) {
-    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(
+    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU);
     if (!dev) {
-        dev = lm_ggml_backend_dev_by_type(
+        dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
     }
     if (!dev) {
         return NULL;
```
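The device-type fallback is now spelled out: try a GPU device first, then fall back to CPU. A small usage sketch (not from the package); `lm_ggml_backend_name()` is assumed to be available from ggml-backend.h as it is upstream:

```cpp
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // Picks the first registered GPU device, falling back to the CPU device, as shown above.
    lm_ggml_backend_t backend = lm_ggml_backend_init_best();
    if (backend == NULL) {
        fprintf(stderr, "no usable backend registered\n");
        return 1;
    }
    printf("using backend: %s\n", lm_ggml_backend_name(backend));
    lm_ggml_backend_free(backend);
    return 0;
}
```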
The final hunk (`@@ -673,1918 +716,1946 @@`) rewrites the remainder of the file. The removed side is the entire in-file CPU backend: the CPU buffer and buffer-type interfaces, the HBM buffer type, the CPU backend interface (graph plans, `lm_ggml_backend_cpu_set_n_threads`, `lm_ggml_backend_cpu_set_threadpool`, `lm_ggml_backend_cpu_set_abort_callback`, `lm_ggml_backend_cpu_buffer_from_ptr`), the CPU device and registration code, and its op-support checks; per the file list above, that code now ships in the new package/cpp/ggml-cpu.c and ggml-cpu.h. The added side brings in the multi-buffer buffer implementation, the `lm_ggml_dup_tensor_layout` and `lm_ggml_is_view_op` helpers, and the graph scheduler: `struct lm_ggml_backend_sched`, the backend-assignment passes (1-5) of `lm_ggml_backend_sched_split_graph`, and the construction of the split graph copy with per-split input copies. The diff view is cut off partway through this hunk. The opening of the hunk, as captured:

```diff
@@ -673,1918 +716,1946 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
     return lm_ggml_backend_dev_init(dev, NULL);
 }
 
+// multi-buffer buffer
+
+struct lm_ggml_backend_multi_buffer_context {
+    lm_ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
```

Further down in the same hunk, the new scheduler state and its lookup macros:

```diff
+// scheduler
+
+#ifndef LM_GGML_SCHED_MAX_BACKENDS
+#define LM_GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
+#define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
+#endif
+
+#ifndef LM_GGML_SCHED_MAX_COPIES
+#define LM_GGML_SCHED_MAX_COPIES 4
+#endif
+
+struct lm_ggml_backend_sched_split {
+    int backend_id;
+    int i_start;
+    int i_end;
+    struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_inputs;
+    // graph view of this split
+    struct lm_ggml_cgraph graph;
+};
+
+struct lm_ggml_backend_sched {
+    bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;
+
+    int n_backends;
+
+    lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
+    lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
+    lm_ggml_gallocr_t galloc;
+
+    // hash map of the nodes in the graph
+    struct lm_ggml_hash_set hash_set;
+    int * hv_tensor_backend_ids; // [hash_set.size]
+    struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
+
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]
+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
+    // copy of the graph with modified inputs
+    struct lm_ggml_cgraph graph;
+
+    // graph splits
+    struct lm_ggml_backend_sched_split * splits;
+    int n_splits;
+    int splits_capacity;
+
+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
+    struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
+    struct lm_ggml_context * ctx;
+
+    lm_ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
+
+    char * context_buffer;
+    size_t context_buffer_size;
+
+    int debug;
+};
+
+#define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
```
|
1471
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1472
|
+
struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
1473
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
1474
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1475
|
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
1476
|
+
}
|
1477
|
+
}
|
1478
|
+
|
1479
|
+
for (int i = 0; i < sched->n_splits; i++) {
|
1480
|
+
struct lm_ggml_backend_sched_split * split = &sched->splits[i];
|
1481
|
+
int backend_id = split->backend_id;
|
1482
|
+
for (int j = 0; j < split->n_inputs; j++) {
|
1483
|
+
struct lm_ggml_tensor * input = split->inputs[j];
|
1484
|
+
size_t id = hash_id(input);
|
1485
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1486
|
+
struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
1487
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
1488
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1489
|
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
1490
|
+
}
|
1491
|
+
}
|
1492
|
+
}
|
1346
1493
|
}
|
1347
|
-
return dup;
|
1348
|
-
}
|
1349
1494
|
|
1350
|
-
|
1351
|
-
|
1495
|
+
// add leafs from the original graph
|
1496
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
1497
|
+
struct lm_ggml_tensor * leaf = graph->leafs[i];
|
1498
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
1499
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1500
|
+
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
1501
|
+
}
|
1352
1502
|
}
|
1353
1503
|
|
1354
|
-
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1358
|
-
|
1359
|
-
|
1360
|
-
|
1361
|
-
|
1362
|
-
|
1504
|
+
static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
|
1505
|
+
bool backend_ids_changed = false;
|
1506
|
+
for (int i = 0; i < sched->graph.n_nodes; i++) {
|
1507
|
+
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
1508
|
+
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
1509
|
+
backend_ids_changed = true;
|
1510
|
+
break;
|
1511
|
+
}
|
1512
|
+
}
|
1513
|
+
if (!backend_ids_changed) {
|
1514
|
+
for (int i = 0; i < sched->graph.n_leafs; i++) {
|
1515
|
+
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
1516
|
+
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
1517
|
+
backend_ids_changed = true;
|
1518
|
+
break;
|
1519
|
+
}
|
1520
|
+
}
|
1521
|
+
}
|
1363
1522
|
|
1364
|
-
|
1365
|
-
|
1523
|
+
// allocate graph
|
1524
|
+
if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
1525
|
+
// the re-allocation may cause the split inputs to be moved to a different address
|
1526
|
+
lm_ggml_backend_sched_synchronize(sched);
|
1527
|
+
#ifndef NDEBUG
|
1528
|
+
LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
1366
1529
|
#endif
|
1530
|
+
lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
1531
|
+
if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
1532
|
+
LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
1533
|
+
return false;
|
1534
|
+
}
|
1535
|
+
}
|
1367
1536
|
|
1368
|
-
|
1369
|
-
|
1370
|
-
int i_start;
|
1371
|
-
int i_end;
|
1372
|
-
struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
|
1373
|
-
int n_inputs;
|
1374
|
-
// graph view of this split
|
1375
|
-
struct lm_ggml_cgraph graph;
|
1376
|
-
};
|
1377
|
-
|
1378
|
-
struct lm_ggml_backend_sched {
|
1379
|
-
bool is_reset; // true if the scheduler has been reset since the last graph split
|
1380
|
-
bool is_alloc;
|
1381
|
-
|
1382
|
-
int n_backends;
|
1537
|
+
return true;
|
1538
|
+
}
|
1383
1539
|
|
1384
|
-
|
1385
|
-
|
1386
|
-
lm_ggml_gallocr_t galloc;
|
1540
|
+
static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
|
1541
|
+
struct lm_ggml_backend_sched_split * splits = sched->splits;
|
1387
1542
|
|
1388
|
-
|
1389
|
-
|
1390
|
-
|
1391
|
-
|
1543
|
+
for (int i = 0; i < sched->n_splits; i++) {
|
1544
|
+
struct lm_ggml_backend_sched_split * split = &splits[i];
|
1545
|
+
int split_backend_id = split->backend_id;
|
1546
|
+
lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
|
1392
1547
|
|
1393
|
-
|
1394
|
-
|
1548
|
+
// copy the input tensors to the split backend
|
1549
|
+
for (int j = 0; j < split->n_inputs; j++) {
|
1550
|
+
lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
1551
|
+
struct lm_ggml_tensor * input = split->inputs[j];
|
1552
|
+
struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
1395
1553
|
|
1396
|
-
|
1397
|
-
|
1554
|
+
if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
|
1555
|
+
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
1556
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1557
|
+
lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
1558
|
+
} else {
|
1559
|
+
lm_ggml_backend_synchronize(split_backend);
|
1560
|
+
}
|
1561
|
+
lm_ggml_backend_tensor_copy(input, input_cpy);
|
1562
|
+
} else {
|
1563
|
+
// wait for the split backend to finish using the input before overwriting it
|
1564
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1565
|
+
lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
1566
|
+
} else {
|
1567
|
+
lm_ggml_backend_synchronize(split_backend);
|
1568
|
+
}
|
1569
|
+
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
1570
|
+
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
1571
|
+
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
1572
|
+
lm_ggml_backend_synchronize(input_backend);
|
1573
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1574
|
+
lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
1575
|
+
} else {
|
1576
|
+
lm_ggml_backend_synchronize(split_backend);
|
1577
|
+
}
|
1578
|
+
lm_ggml_backend_tensor_copy(input, input_cpy);
|
1579
|
+
}
|
1580
|
+
}
|
1581
|
+
}
|
1398
1582
|
|
1399
|
-
|
1400
|
-
|
1583
|
+
if (!sched->callback_eval) {
|
1584
|
+
enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
|
1585
|
+
if (ec != LM_GGML_STATUS_SUCCESS) {
|
1586
|
+
return ec;
|
1587
|
+
}
|
1588
|
+
} else {
|
1589
|
+
// similar to lm_ggml_backend_compare_graph_backend
|
1590
|
+
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
1591
|
+
struct lm_ggml_tensor * t = split->graph.nodes[j0];
|
1401
1592
|
|
1402
|
-
|
1403
|
-
|
1404
|
-
int n_splits;
|
1405
|
-
int splits_capacity;
|
1593
|
+
// check if the user needs data from this node
|
1594
|
+
bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
1406
1595
|
|
1407
|
-
|
1408
|
-
int n_copies;
|
1409
|
-
int cur_copy;
|
1410
|
-
lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
|
1411
|
-
struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
|
1412
|
-
int n_graph_inputs;
|
1596
|
+
int j1 = j0;
|
1413
1597
|
|
1414
|
-
|
1598
|
+
// determine the range [j0, j1] of nodes that can be computed together
|
1599
|
+
while (!need && j1 < split->graph.n_nodes - 1) {
|
1600
|
+
t = split->graph.nodes[++j1];
|
1601
|
+
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
1602
|
+
}
|
1415
1603
|
|
1416
|
-
|
1417
|
-
void * callback_eval_user_data;
|
1604
|
+
struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
|
1418
1605
|
|
1419
|
-
|
1420
|
-
|
1606
|
+
enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
|
1607
|
+
if (ec != LM_GGML_STATUS_SUCCESS) {
|
1608
|
+
return ec;
|
1609
|
+
}
|
1421
1610
|
|
1422
|
-
|
1423
|
-
|
1611
|
+
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
|
1612
|
+
lm_ggml_backend_synchronize(split_backend);
|
1424
1613
|
|
1425
|
-
|
1426
|
-
|
1427
|
-
|
1428
|
-
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
|
1614
|
+
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
1615
|
+
break;
|
1616
|
+
}
|
1429
1617
|
|
1430
|
-
|
1431
|
-
|
1432
|
-
for (int i = 0; i < sched->n_backends; i++) {
|
1433
|
-
if (sched->backends[i] == backend) {
|
1434
|
-
return i;
|
1618
|
+
j0 = j1;
|
1619
|
+
}
|
1435
1620
|
}
|
1436
|
-
}
|
1437
|
-
return -1;
|
1438
|
-
}
|
1439
|
-
|
1440
|
-
static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
|
1441
|
-
lm_ggml_backend_buffer_t buffer = tensor->buffer;
|
1442
|
-
if (buffer == NULL) {
|
1443
|
-
return -1;
|
1444
|
-
}
|
1445
1621
|
|
1446
|
-
|
1447
|
-
|
1448
|
-
|
1449
|
-
|
1450
|
-
|
1622
|
+
// record the event of this copy
|
1623
|
+
if (split->n_inputs > 0) {
|
1624
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1625
|
+
lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
|
1626
|
+
}
|
1451
1627
|
}
|
1452
1628
|
}
|
1453
1629
|
|
1454
|
-
|
1455
|
-
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
1456
|
-
__func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
|
1457
|
-
#endif
|
1630
|
+
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
1458
1631
|
|
1459
|
-
return
|
1632
|
+
return LM_GGML_STATUS_SUCCESS;
|
1460
1633
|
}
|
1461
1634
|
|
1462
|
-
|
1463
|
-
|
1464
|
-
|
1465
|
-
|
1466
|
-
|
1467
|
-
|
1468
|
-
|
1469
|
-
|
1470
|
-
|
1635
|
+
lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
|
1636
|
+
lm_ggml_backend_t * backends,
|
1637
|
+
lm_ggml_backend_buffer_type_t * bufts,
|
1638
|
+
int n_backends,
|
1639
|
+
size_t graph_size,
|
1640
|
+
bool parallel) {
|
1641
|
+
LM_GGML_ASSERT(n_backends > 0);
|
1642
|
+
LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
|
1643
|
+
LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
1471
1644
|
|
1472
|
-
|
1473
|
-
static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
|
1474
|
-
// TODO: use supports_op to check if the backend supports the op
|
1645
|
+
struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
|
1475
1646
|
|
1476
|
-
|
1477
|
-
|
1478
|
-
|
1479
|
-
|
1480
|
-
return cur_backend_id;
|
1481
|
-
}
|
1647
|
+
const char * LM_GGML_SCHED_DEBUG = getenv("LM_GGML_SCHED_DEBUG");
|
1648
|
+
sched->debug = LM_GGML_SCHED_DEBUG ? atoi(LM_GGML_SCHED_DEBUG) : 0;
|
1649
|
+
sched->n_backends = n_backends;
|
1650
|
+
sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
|
1482
1651
|
|
1483
|
-
//
|
1484
|
-
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
return cur_backend_id;
|
1489
|
-
}
|
1490
|
-
}
|
1652
|
+
// initialize hash table
|
1653
|
+
// FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
|
1654
|
+
sched->hash_set = lm_ggml_hash_set_new(graph_size);
|
1655
|
+
sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
1656
|
+
sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
|
1491
1657
|
|
1492
|
-
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
1658
|
+
const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
|
1659
|
+
const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
1660
|
+
sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
1661
|
+
sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
1662
|
+
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
1663
|
+
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
1496
1664
|
|
1497
|
-
|
1498
|
-
|
1499
|
-
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
1500
|
-
SET_CAUSE(tensor, "1.inp");
|
1501
|
-
return cur_backend_id;
|
1502
|
-
}
|
1665
|
+
sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
|
1666
|
+
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
1503
1667
|
|
1504
|
-
|
1505
|
-
|
1506
|
-
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1512
|
-
|
1513
|
-
|
1514
|
-
|
1515
|
-
|
1516
|
-
SET_CAUSE(tensor, "1.off");
|
1517
|
-
return b;
|
1518
|
-
}
|
1519
|
-
}
|
1668
|
+
const int initial_splits_capacity = 16;
|
1669
|
+
sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
1670
|
+
sched->splits_capacity = initial_splits_capacity;
|
1671
|
+
|
1672
|
+
for (int b = 0; b < n_backends; b++) {
|
1673
|
+
sched->backends[b] = backends[b];
|
1674
|
+
sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
|
1675
|
+
LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
1676
|
+
|
1677
|
+
if (sched->n_copies > 1) {
|
1678
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1679
|
+
sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
|
1520
1680
|
}
|
1521
|
-
SET_CAUSE(tensor, "1.wgt%d", i);
|
1522
|
-
return src_backend_id;
|
1523
1681
|
}
|
1524
1682
|
}
|
1525
1683
|
|
1526
|
-
|
1684
|
+
sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
|
1685
|
+
|
1686
|
+
lm_ggml_backend_sched_reset(sched);
|
1687
|
+
|
1688
|
+
return sched;
|
1527
1689
|
}
|
1528
1690
|
|
1529
|
-
|
1530
|
-
|
1531
|
-
|
1532
|
-
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
|
1533
|
-
} else {
|
1534
|
-
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
|
1691
|
+
void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
|
1692
|
+
if (sched == NULL) {
|
1693
|
+
return;
|
1535
1694
|
}
|
1536
|
-
|
1695
|
+
for (int b = 0; b < sched->n_backends; b++) {
|
1696
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1697
|
+
lm_ggml_backend_event_free(sched->events[b][c]);
|
1698
|
+
}
|
1699
|
+
}
|
1700
|
+
lm_ggml_gallocr_free(sched->galloc);
|
1701
|
+
lm_ggml_free(sched->ctx);
|
1702
|
+
lm_ggml_hash_set_free(&sched->hash_set);
|
1703
|
+
free(sched->splits);
|
1704
|
+
free(sched->hv_tensor_backend_ids);
|
1705
|
+
free(sched->hv_tensor_copies);
|
1706
|
+
free(sched->node_backend_ids);
|
1707
|
+
free(sched->leaf_backend_ids);
|
1708
|
+
free(sched->prev_node_backend_ids);
|
1709
|
+
free(sched->prev_leaf_backend_ids);
|
1710
|
+
free(sched->context_buffer);
|
1711
|
+
free(sched->graph.nodes);
|
1712
|
+
free(sched->graph.leafs);
|
1713
|
+
free(sched);
|
1537
1714
|
}
|
1538
1715
|
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
|
1544
|
-
|
1545
|
-
|
1546
|
-
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
1547
|
-
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
1548
|
-
fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
1549
|
-
}
|
1550
|
-
fprintf(stderr, "\n");
|
1551
|
-
cur_split++;
|
1552
|
-
}
|
1553
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1554
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1555
|
-
continue;
|
1556
|
-
}
|
1557
|
-
lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
|
1558
|
-
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
|
1559
|
-
fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
1560
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1561
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1562
|
-
if (src == NULL) {
|
1563
|
-
continue;
|
1564
|
-
}
|
1565
|
-
lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
|
1566
|
-
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
1567
|
-
fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
1568
|
-
}
|
1569
|
-
fprintf(stderr, "\n");
|
1716
|
+
void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
|
1717
|
+
// reset state for the next run
|
1718
|
+
if (!sched->is_reset) {
|
1719
|
+
lm_ggml_hash_set_reset(&sched->hash_set);
|
1720
|
+
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
1721
|
+
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
|
1722
|
+
sched->is_reset = true;
|
1570
1723
|
}
|
1724
|
+
sched->is_alloc = false;
|
1571
1725
|
}
|
1572
1726
|
|
1573
|
-
|
1574
|
-
|
1575
|
-
lm_ggml_backend_buffer_type_t buft = NULL;
|
1727
|
+
bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
|
1728
|
+
LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
1576
1729
|
|
1577
|
-
|
1578
|
-
|
1579
|
-
|
1580
|
-
|
1581
|
-
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
1582
|
-
int tensor_backend_id = tensor_backend_id(t);
|
1583
|
-
if (tensor_backend_id == -1 && t->view_src) {
|
1584
|
-
tensor_backend_id = tensor_backend_id(t->view_src);
|
1585
|
-
}
|
1586
|
-
if (tensor_backend_id != -1) {
|
1587
|
-
buft = sched->bufts[tensor_backend_id];
|
1588
|
-
}
|
1730
|
+
lm_ggml_backend_sched_split_graph(sched, measure_graph);
|
1731
|
+
|
1732
|
+
if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
1733
|
+
return false;
|
1589
1734
|
}
|
1590
1735
|
|
1591
|
-
|
1592
|
-
|
1736
|
+
lm_ggml_backend_sched_reset(sched);
|
1737
|
+
lm_ggml_backend_sched_synchronize(sched);
|
1593
1738
|
|
1594
|
-
|
1595
|
-
if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
1596
|
-
*node_backend_id = cur_backend_id;
|
1597
|
-
SET_CAUSE(node, "2.sup");
|
1598
|
-
}
|
1739
|
+
return true;
|
1599
1740
|
}
|
1600
1741
|
|
1601
|
-
|
1602
|
-
|
1603
|
-
// reset splits
|
1604
|
-
sched->n_splits = 0;
|
1605
|
-
sched->n_graph_inputs = 0;
|
1606
|
-
sched->is_reset = false;
|
1742
|
+
bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
1743
|
+
LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
1607
1744
|
|
1608
|
-
|
1609
|
-
/* .mem_size = */ sched->context_buffer_size,
|
1610
|
-
/* .mem_buffer = */ sched->context_buffer,
|
1611
|
-
/* .no_alloc = */ true
|
1612
|
-
};
|
1745
|
+
lm_ggml_backend_sched_split_graph(sched, graph);
|
1613
1746
|
|
1614
|
-
lm_ggml_free(sched->ctx);
|
1615
1747
|
|
1616
|
-
|
1617
|
-
|
1618
|
-
LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
|
1748
|
+
if (!lm_ggml_backend_sched_alloc_splits(sched)) {
|
1749
|
+
return false;
|
1619
1750
|
}
|
1620
1751
|
|
1621
|
-
|
1622
|
-
for (int i = 0; i < graph->n_leafs; i++) {
|
1623
|
-
struct lm_ggml_tensor * leaf = graph->leafs[i];
|
1624
|
-
int * leaf_backend_id = &tensor_backend_id(leaf);
|
1625
|
-
// do not overwrite user assignments
|
1626
|
-
if (*leaf_backend_id == -1) {
|
1627
|
-
*leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
1628
|
-
}
|
1629
|
-
}
|
1752
|
+
sched->is_alloc = true;
|
1630
1753
|
|
1631
|
-
|
1632
|
-
|
1633
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1634
|
-
// do not overwrite user assignments
|
1635
|
-
if (*node_backend_id == -1) {
|
1636
|
-
*node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
|
1754
|
+
return true;
|
1755
|
+
}
|
1637
1756
|
|
1638
|
-
|
1639
|
-
|
1640
|
-
|
1641
|
-
|
1642
|
-
|
1757
|
+
enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
1758
|
+
enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
|
1759
|
+
lm_ggml_backend_sched_synchronize(sched);
|
1760
|
+
return err;
|
1761
|
+
}
|
1643
1762
|
|
1644
|
-
|
1645
|
-
|
1646
|
-
|
1647
|
-
continue;
|
1648
|
-
}
|
1649
|
-
int * src_backend_id = &tensor_backend_id(src);
|
1650
|
-
if (*src_backend_id == -1) {
|
1651
|
-
*src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
|
1652
|
-
}
|
1653
|
-
}
|
1654
|
-
#endif
|
1655
|
-
}
|
1763
|
+
enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
|
1764
|
+
if (!sched->is_reset && !sched->is_alloc) {
|
1765
|
+
lm_ggml_backend_sched_reset(sched);
|
1656
1766
|
}
|
1657
1767
|
|
1658
|
-
|
1659
|
-
|
1660
|
-
|
1661
|
-
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
1662
|
-
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
|
1663
|
-
// expand gpu down
|
1664
|
-
{
|
1665
|
-
int cur_backend_id = -1;
|
1666
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1667
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1668
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1669
|
-
continue;
|
1670
|
-
}
|
1671
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1672
|
-
if (*node_backend_id != -1) {
|
1673
|
-
if (*node_backend_id == sched->n_backends - 1) {
|
1674
|
-
// skip cpu (lowest prio backend)
|
1675
|
-
cur_backend_id = -1;
|
1676
|
-
} else {
|
1677
|
-
cur_backend_id = *node_backend_id;
|
1678
|
-
}
|
1679
|
-
} else if (cur_backend_id != -1) {
|
1680
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1681
|
-
}
|
1682
|
-
}
|
1683
|
-
}
|
1684
|
-
// expand gpu up
|
1685
|
-
{
|
1686
|
-
int cur_backend_id = -1;
|
1687
|
-
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1688
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1689
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1690
|
-
continue;
|
1691
|
-
}
|
1692
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1693
|
-
if (*node_backend_id != -1) {
|
1694
|
-
if (*node_backend_id == sched->n_backends - 1) {
|
1695
|
-
// skip cpu (lowest prio backend)
|
1696
|
-
cur_backend_id = -1;
|
1697
|
-
} else {
|
1698
|
-
cur_backend_id = *node_backend_id;
|
1699
|
-
}
|
1700
|
-
} else if (cur_backend_id != -1) {
|
1701
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1702
|
-
}
|
1703
|
-
}
|
1704
|
-
}
|
1705
|
-
// expand rest down
|
1706
|
-
{
|
1707
|
-
int cur_backend_id = -1;
|
1708
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1709
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1710
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1711
|
-
continue;
|
1712
|
-
}
|
1713
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1714
|
-
if (*node_backend_id != -1) {
|
1715
|
-
cur_backend_id = *node_backend_id;
|
1716
|
-
} else if (cur_backend_id != -1) {
|
1717
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1718
|
-
}
|
1719
|
-
}
|
1720
|
-
}
|
1721
|
-
// expand rest up
|
1722
|
-
{
|
1723
|
-
int cur_backend_id = -1;
|
1724
|
-
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1725
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1726
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1727
|
-
continue;
|
1728
|
-
}
|
1729
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1730
|
-
if (*node_backend_id != -1) {
|
1731
|
-
cur_backend_id = *node_backend_id;
|
1732
|
-
} else if (cur_backend_id != -1) {
|
1733
|
-
lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
1734
|
-
}
|
1768
|
+
if (!sched->is_alloc) {
|
1769
|
+
if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
|
1770
|
+
return LM_GGML_STATUS_ALLOC_FAILED;
|
1735
1771
|
}
|
1736
1772
|
}
|
1737
1773
|
|
1738
|
-
|
1739
|
-
|
1740
|
-
|
1741
|
-
|
1742
|
-
|
1743
|
-
|
1744
|
-
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
1745
|
-
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
1746
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1747
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1748
|
-
if (lm_ggml_is_view_op(node->op)) {
|
1749
|
-
continue;
|
1750
|
-
}
|
1751
|
-
int * node_backend_id = &tensor_backend_id(node);
|
1752
|
-
if (*node_backend_id == -1) {
|
1753
|
-
// unassigned node: find the backend with the most supported inputs
|
1754
|
-
int n_supported_best = -1;
|
1755
|
-
for (int b = 0; b < sched->n_backends; b++) {
|
1756
|
-
if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
|
1757
|
-
int n_supported = 0;
|
1758
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1759
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1760
|
-
if (src == NULL) {
|
1761
|
-
continue;
|
1762
|
-
}
|
1763
|
-
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
|
1764
|
-
n_supported++;
|
1765
|
-
}
|
1766
|
-
}
|
1767
|
-
if (n_supported > n_supported_best) {
|
1768
|
-
n_supported_best = n_supported;
|
1769
|
-
*node_backend_id = b;
|
1770
|
-
SET_CAUSE(node, "3.best");
|
1771
|
-
}
|
1772
|
-
}
|
1773
|
-
}
|
1774
|
-
} else {
|
1775
|
-
// assigned node: upgrade to higher prio backend if possible
|
1776
|
-
for (int b = 0; b < *node_backend_id; b++) {
|
1777
|
-
if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
|
1778
|
-
bool supported = true;
|
1779
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1780
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1781
|
-
if (src == NULL) {
|
1782
|
-
continue;
|
1783
|
-
}
|
1784
|
-
if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
|
1785
|
-
supported = false;
|
1786
|
-
break;
|
1787
|
-
}
|
1788
|
-
}
|
1789
|
-
if (supported) {
|
1790
|
-
*node_backend_id = b;
|
1791
|
-
SET_CAUSE(node, "3.upg");
|
1792
|
-
break;
|
1793
|
-
}
|
1794
|
-
}
|
1795
|
-
}
|
1796
|
-
}
|
1774
|
+
return lm_ggml_backend_sched_compute_splits(sched);
|
1775
|
+
}
|
1776
|
+
|
1777
|
+
void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
|
1778
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
1779
|
+
lm_ggml_backend_synchronize(sched->backends[i]);
|
1797
1780
|
}
|
1781
|
+
}
|
1782
|
+
|
1783
|
+
void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
|
1784
|
+
sched->callback_eval = callback;
|
1785
|
+
sched->callback_eval_user_data = user_data;
|
1786
|
+
}
|
1787
|
+
|
1788
|
+
int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
|
1789
|
+
return sched->n_splits;
|
1790
|
+
}
|
1791
|
+
|
1792
|
+
int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
|
1793
|
+
return sched->n_copies;
|
1794
|
+
}
|
1795
|
+
|
1796
|
+
int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
|
1797
|
+
return sched->n_backends;
|
1798
|
+
}
|
1799
|
+
|
1800
|
+
lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
|
1801
|
+
LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
1802
|
+
return sched->backends[i];
|
1803
|
+
}
|
1804
|
+
|
1805
|
+
size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
|
1806
|
+
int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
|
1807
|
+
LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
1808
|
+
|
1809
|
+
return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
1810
|
+
}
|
1811
|
+
|
1812
|
+
void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
|
1813
|
+
int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
|
1814
|
+
LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
1815
|
+
tensor_backend_id(node) = backend_index;
|
1816
|
+
SET_CAUSE(node, "usr");
|
1817
|
+
sched->is_reset = false;
|
1818
|
+
}
|
1798
1819
|
|
1799
|
-
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
if (node->view_src != NULL && *cur_backend_id == -1) {
|
1804
|
-
*cur_backend_id = tensor_backend_id(node->view_src);
|
1805
|
-
SET_CAUSE(node, "4.vsrc");
|
1806
|
-
}
|
1807
|
-
for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
|
1808
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1809
|
-
if (src == NULL) {
|
1810
|
-
continue;
|
1811
|
-
}
|
1812
|
-
int * src_backend_id = &tensor_backend_id(src);
|
1813
|
-
if (*src_backend_id == -1) {
|
1814
|
-
if (src->view_src != NULL) {
|
1815
|
-
// views are always on the same backend as the source
|
1816
|
-
*src_backend_id = tensor_backend_id(src->view_src);
|
1817
|
-
SET_CAUSE(src, "4.vsrc");
|
1818
|
-
} else {
|
1819
|
-
*src_backend_id = *cur_backend_id;
|
1820
|
-
SET_CAUSE(src, "4.cur");
|
1821
|
-
}
|
1822
|
-
}
|
1823
|
-
}
|
1820
|
+
lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
|
1821
|
+
int backend_index = tensor_backend_id(node);
|
1822
|
+
if (backend_index == -1) {
|
1823
|
+
return NULL;
|
1824
1824
|
}
|
1825
|
+
return sched->backends[backend_index];
|
1826
|
+
}
|
1825
1827
|
|
1826
|
-
|
1827
|
-
{
|
1828
|
-
int i_split = 0;
|
1829
|
-
struct lm_ggml_backend_sched_split * split = &sched->splits[0];
|
1830
|
-
// find the backend of the first split, skipping view ops
|
1831
|
-
int i = 0;
|
1832
|
-
for (; i < graph->n_nodes; i++) {
|
1833
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1834
|
-
if (!lm_ggml_is_view_op(node->op)) {
|
1835
|
-
split->backend_id = tensor_backend_id(node);
|
1836
|
-
break;
|
1837
|
-
}
|
1838
|
-
}
|
1839
|
-
split->i_start = 0;
|
1840
|
-
split->n_inputs = 0;
|
1841
|
-
int cur_backend_id = split->backend_id;
|
1842
|
-
for (; i < graph->n_nodes; i++) {
|
1843
|
-
struct lm_ggml_tensor * node = graph->nodes[i];
|
1828
|
+
// utils
|
1844
1829
|
|
1845
|
-
|
1846
|
-
|
1847
|
-
|
1830
|
+
void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
|
1831
|
+
LM_GGML_ASSERT(tensor->buffer == NULL);
|
1832
|
+
LM_GGML_ASSERT(tensor->view_src != NULL);
|
1833
|
+
LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
|
1834
|
+
LM_GGML_ASSERT(tensor->view_src->data != NULL);
|
1848
1835
|
|
1849
|
-
|
1836
|
+
tensor->buffer = tensor->view_src->buffer;
|
1837
|
+
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
1838
|
+
lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
1839
|
+
}
|
1850
1840
|
|
1851
|
-
|
1841
|
+
void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
|
1842
|
+
LM_GGML_ASSERT(tensor->buffer == NULL);
|
1843
|
+
LM_GGML_ASSERT(tensor->data == NULL);
|
1844
|
+
LM_GGML_ASSERT(tensor->view_src == NULL);
|
1845
|
+
LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
|
1846
|
+
LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
|
1847
|
+
(char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
|
1852
1848
|
|
1853
|
-
|
1854
|
-
|
1855
|
-
|
1856
|
-
|
1857
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1858
|
-
if (src == NULL) {
|
1859
|
-
continue;
|
1860
|
-
}
|
1861
|
-
// check if a weight is on a different backend
|
1862
|
-
// by starting a new split, the memory of the previously offloaded weights can be reused
|
1863
|
-
if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
1864
|
-
int src_backend_id = tensor_backend_id(src);
|
1865
|
-
if (src_backend_id != cur_backend_id) {
|
1866
|
-
need_new_split = true;
|
1867
|
-
break;
|
1868
|
-
}
|
1869
|
-
}
|
1870
|
-
// check if the split has too many inputs
|
1871
|
-
// FIXME: count the number of inputs instead of only checking when full
|
1872
|
-
if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
|
1873
|
-
const size_t id = hash_id(src);
|
1874
|
-
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
1875
|
-
bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
1876
|
-
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
1877
|
-
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
1878
|
-
need_new_split = true;
|
1879
|
-
break;
|
1880
|
-
}
|
1881
|
-
}
|
1882
|
-
}
|
1883
|
-
}
|
1849
|
+
tensor->buffer = buffer;
|
1850
|
+
tensor->data = addr;
|
1851
|
+
lm_ggml_backend_buffer_init_tensor(buffer, tensor);
|
1852
|
+
}
|
1884
1853
|
|
1885
|
-
|
1886
|
-
|
1887
|
-
i_split++;
|
1888
|
-
if (i_split >= sched->splits_capacity) {
|
1889
|
-
sched->splits_capacity *= 2;
|
1890
|
-
sched->splits = (lm_ggml_backend_sched_split *)
|
1891
|
-
realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
|
1892
|
-
LM_GGML_ASSERT(sched->splits != NULL);
|
1893
|
-
}
|
1894
|
-
split = &sched->splits[i_split];
|
1895
|
-
split->backend_id = node_backend_id;
|
1896
|
-
split->i_start = i;
|
1897
|
-
split->n_inputs = 0;
|
1898
|
-
cur_backend_id = node_backend_id;
|
1899
|
-
}
|
1854
|
+
static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
|
1855
|
+
struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
|
1900
1856
|
|
1901
|
-
|
1902
|
-
|
1903
|
-
struct lm_ggml_tensor * src = node->src[j];
|
1904
|
-
if (src == NULL) {
|
1905
|
-
continue;
|
1906
|
-
}
|
1857
|
+
LM_GGML_ASSERT(src != NULL);
|
1858
|
+
LM_GGML_ASSERT(src->data && "graph must be allocated");
|
1907
1859
|
|
1908
|
-
|
1909
|
-
|
1910
|
-
|
1860
|
+
size_t id = lm_ggml_hash_insert(&hash_set, src);
|
1861
|
+
if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
|
1862
|
+
return node_copies[lm_ggml_hash_find(&hash_set, src)];
|
1863
|
+
}
|
1911
1864
|
|
1912
|
-
|
1913
|
-
|
1914
|
-
|
1915
|
-
|
1916
|
-
|
1917
|
-
|
1918
|
-
|
1919
|
-
|
1920
|
-
tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
|
1921
|
-
lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
|
1922
|
-
}
|
1923
|
-
if (sched->n_copies > 1) {
|
1924
|
-
lm_ggml_set_input(tensor_copy);
|
1925
|
-
lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
1926
|
-
}
|
1927
|
-
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
1928
|
-
SET_CAUSE(tensor_copy, "4.cpy");
|
1929
|
-
}
|
1930
|
-
int n_graph_inputs = sched->n_graph_inputs++;
|
1931
|
-
LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
|
1932
|
-
sched->graph_inputs[n_graph_inputs] = src;
|
1933
|
-
}
|
1934
|
-
}
|
1865
|
+
struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
1866
|
+
if (src->view_src != NULL) {
|
1867
|
+
dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
|
1868
|
+
dst->view_offs = src->view_offs;
|
1869
|
+
}
|
1870
|
+
dst->op = src->op;
|
1871
|
+
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
|
1872
|
+
lm_ggml_set_name(dst, src->name);
|
1935
1873
|
|
1936
|
-
|
1937
|
-
|
1938
|
-
|
1939
|
-
|
1940
|
-
|
1941
|
-
struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
|
1942
|
-
lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
|
1943
|
-
if (sched->n_copies > 1) {
|
1944
|
-
lm_ggml_set_input(tensor_copy);
|
1945
|
-
lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
1946
|
-
}
|
1947
|
-
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
|
1948
|
-
SET_CAUSE(tensor_copy, "4.cpy");
|
1949
|
-
}
|
1950
|
-
int n_inputs = split->n_inputs++;
|
1951
|
-
LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
|
1952
|
-
split->inputs[n_inputs] = src;
|
1953
|
-
}
|
1954
|
-
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
|
1955
|
-
}
|
1956
|
-
}
|
1874
|
+
// copy src
|
1875
|
+
for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
|
1876
|
+
struct lm_ggml_tensor * s = src->src[i];
|
1877
|
+
if (s == NULL) {
|
1878
|
+
continue;
|
1957
1879
|
}
|
1958
|
-
|
1959
|
-
|
1880
|
+
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
1881
|
+
}
|
1882
|
+
|
1883
|
+
node_copies[id] = dst;
|
1884
|
+
return dst;
|
1885
|
+
}
|
1886
|
+
|
1887
|
+
static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
|
1888
|
+
size_t id = lm_ggml_hash_find(hash_set, src);
|
1889
|
+
if (node_init[id]) {
|
1890
|
+
return;
|
1891
|
+
}
|
1892
|
+
node_init[id] = true;
|
1893
|
+
|
1894
|
+
struct lm_ggml_tensor * dst = node_copies[id];
|
1895
|
+
if (dst->view_src != NULL) {
|
1896
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
1897
|
+
lm_ggml_backend_view_init(dst);
|
1898
|
+
}
|
1899
|
+
else {
|
1900
|
+
lm_ggml_backend_tensor_copy(src, dst);
|
1901
|
+
}
|
1902
|
+
|
1903
|
+
// init src
|
1904
|
+
for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
|
1905
|
+
struct lm_ggml_tensor * s = src->src[i];
|
1906
|
+
if (s == NULL) {
|
1907
|
+
continue;
|
1908
|
+
}
|
1909
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
1910
|
+
}
|
1911
|
+
}
|
1912
|
+
|
1913
|
+
struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
|
1914
|
+
struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
|
1915
|
+
struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
1916
|
+
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
|
1917
|
+
|
1918
|
+
struct lm_ggml_init_params params = {
|
1919
|
+
/* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
|
1920
|
+
/* .mem_buffer = */ NULL,
|
1921
|
+
/* .no_alloc = */ true
|
1922
|
+
};
|
1923
|
+
|
1924
|
+
struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
|
1925
|
+
struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
|
1926
|
+
|
1927
|
+
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
1928
|
+
LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
|
1929
|
+
lm_ggml_hash_set_free(&hash_set);
|
1930
|
+
free(node_copies);
|
1931
|
+
free(node_init);
|
1932
|
+
lm_ggml_free(ctx_allocated);
|
1933
|
+
lm_ggml_free(ctx_unallocated);
|
1934
|
+
return {
|
1935
|
+
/* .buffer = */ NULL,
|
1936
|
+
/* .ctx_allocated = */ NULL,
|
1937
|
+
/* .ctx_unallocated = */ NULL,
|
1938
|
+
/* .graph = */ NULL,
|
1939
|
+
};
|
1940
|
+
}
|
1941
|
+
|
1942
|
+
// dup nodes
|
1943
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1944
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1945
|
+
graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
1960
1946
|
}
|
1961
1947
|
|
1962
|
-
|
1963
|
-
|
1948
|
+
// allocate nodes
|
1949
|
+
lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
1950
|
+
if (buffer == NULL) {
|
1951
|
+
LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
|
1952
|
+
lm_ggml_hash_set_free(&hash_set);
|
1953
|
+
free(node_copies);
|
1954
|
+
free(node_init);
|
1955
|
+
lm_ggml_free(ctx_allocated);
|
1956
|
+
lm_ggml_free(ctx_unallocated);
|
1957
|
+
return {
|
1958
|
+
/* .buffer = */ NULL,
|
1959
|
+
/* .ctx_allocated = */ NULL,
|
1960
|
+
/* .ctx_unallocated = */ NULL,
|
1961
|
+
/* .graph = */ NULL,
|
1962
|
+
};
|
1964
1963
|
}
|
1965
1964
|
|
1966
|
-
//
|
1967
|
-
{
|
1968
|
-
int * tmp = sched->node_backend_ids;
|
1969
|
-
sched->node_backend_ids = sched->prev_node_backend_ids;
|
1970
|
-
sched->prev_node_backend_ids = tmp;
|
1965
|
+
//printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
|
1971
1966
|
|
1972
|
-
|
1973
|
-
|
1974
|
-
|
1967
|
+
// copy data and init views
|
1968
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1969
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1970
|
+
graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
|
1975
1971
|
}
|
1976
1972
|
|
1977
|
-
|
1978
|
-
|
1979
|
-
|
1980
|
-
|
1981
|
-
|
1982
|
-
|
1983
|
-
LM_GGML_ASSERT(sched->graph.leafs != NULL);
|
1973
|
+
// build graph copy
|
1974
|
+
struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
|
1975
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1976
|
+
struct lm_ggml_tensor * node = graph->nodes[i];
|
1977
|
+
struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
|
1978
|
+
graph_copy->nodes[i] = node_copy;
|
1984
1979
|
}
|
1985
|
-
|
1986
|
-
sched->graph.n_leafs = 0;
|
1980
|
+
graph_copy->n_nodes = graph->n_nodes;
|
1987
1981
|
|
1988
|
-
|
1982
|
+
lm_ggml_hash_set_free(&hash_set);
|
1983
|
+
free(node_copies);
|
1984
|
+
free(node_init);
|
1989
1985
|
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1986
|
+
return {
|
1987
|
+
/* .buffer = */ buffer,
|
1988
|
+
/* .ctx_allocated = */ ctx_allocated,
|
1989
|
+
/* .ctx_unallocated = */ ctx_unallocated,
|
1990
|
+
/* .graph = */ graph_copy,
|
1991
|
+
};
|
1992
|
+
}
|
1993
1993
|
|
1994
|
-
|
1995
|
-
|
1996
|
-
|
1994
|
+
void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
|
1995
|
+
lm_ggml_backend_buffer_free(copy.buffer);
|
1996
|
+
lm_ggml_free(copy.ctx_allocated);
|
1997
|
+
lm_ggml_free(copy.ctx_unallocated);
|
1998
|
+
}
|
1997
1999
|
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2000
|
+
bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
|
2001
|
+
struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
|
2002
|
+
if (copy.buffer == NULL) {
|
2003
|
+
return false;
|
2004
|
+
}
|
2001
2005
|
|
2002
|
-
|
2003
|
-
|
2004
|
-
input_dep->src[0] = input;
|
2005
|
-
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
|
2006
|
-
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
2006
|
+
struct lm_ggml_cgraph * g1 = graph;
|
2007
|
+
struct lm_ggml_cgraph * g2 = copy.graph;
|
2007
2008
|
|
2008
|
-
|
2009
|
-
sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
|
2010
|
-
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
2011
|
-
}
|
2009
|
+
assert(g1->n_nodes == g2->n_nodes);
|
2012
2010
|
|
2013
|
-
|
2014
|
-
|
2015
|
-
|
2016
|
-
|
2017
|
-
}
|
2018
|
-
}
|
2011
|
+
for (int i = 0; i < g1->n_nodes; i++) {
|
2012
|
+
//printf("eval %d/%d\n", i, g1->n_nodes);
|
2013
|
+
struct lm_ggml_tensor * t1 = g1->nodes[i];
|
2014
|
+
struct lm_ggml_tensor * t2 = g2->nodes[i];
|
2019
2015
|
|
2020
|
-
|
2021
|
-
|
2022
|
-
|
2023
|
-
|
2024
|
-
|
2025
|
-
|
2026
|
-
|
2027
|
-
|
2028
|
-
|
2029
|
-
|
2030
|
-
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
2031
|
-
}
|
2016
|
+
assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
|
2017
|
+
|
2018
|
+
struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
|
2019
|
+
struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
|
2020
|
+
|
2021
|
+
lm_ggml_backend_graph_compute(backend1, &g1v);
|
2022
|
+
lm_ggml_backend_graph_compute(backend2, &g2v);
|
2023
|
+
|
2024
|
+
if (lm_ggml_is_view_op(t1->op)) {
|
2025
|
+
continue;
|
2032
2026
|
}
|
2033
2027
|
|
2034
|
-
|
2035
|
-
|
2036
|
-
|
2037
|
-
for (int j = 0; j < split->n_inputs; j++) {
|
2038
|
-
struct lm_ggml_tensor * input = split->inputs[j];
|
2039
|
-
size_t id = hash_id(input);
|
2040
|
-
for (int c = 0; c < sched->n_copies; c++) {
|
2041
|
-
struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
2042
|
-
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
2043
|
-
assert(graph_copy->size > graph_copy->n_leafs);
|
2044
|
-
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
2045
|
-
}
|
2046
|
-
}
|
2028
|
+
// compare results, calculate rms etc
|
2029
|
+
if (!callback(i, t1, t2, user_data)) {
|
2030
|
+
break;
|
2047
2031
|
}
|
2048
2032
|
}
|
2049
2033
|
|
2050
|
-
|
2051
|
-
|
2052
|
-
|
2053
|
-
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
2054
|
-
assert(graph_copy->size > graph_copy->n_leafs);
|
2055
|
-
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
2056
|
-
}
|
2034
|
+
lm_ggml_backend_graph_copy_free(copy);
|
2035
|
+
|
2036
|
+
return true;
|
2057
2037
|
}
|
2058
2038
|
|
2059
|
-
|
2060
|
-
|
2061
|
-
|
2062
|
-
|
2063
|
-
|
2064
|
-
|
2065
|
-
|
2066
|
-
|
2039
|
+
|
2040
|
+
|
2041
|
+
#include "ggml-backend.h"
|
2042
|
+
#include "ggml-backend-impl.h"
|
2043
|
+
#include "ggml-cpu.h"
|
2044
|
+
#include "ggml-impl.h"
|
2045
|
+
#include <cctype>
|
2046
|
+
#include <string>
|
2047
|
+
|
2048
|
+
// ggml-backend interface
|
2049
|
+
|
2050
|
+
// CPU backend - buffer
|
2051
|
+
|
2052
|
+
static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
|
2053
|
+
uintptr_t data = (uintptr_t)buffer->context;
|
2054
|
+
|
2055
|
+
// align the buffer
|
2056
|
+
if (data % TENSOR_ALIGNMENT != 0) {
|
2057
|
+
data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
|
2067
2058
|
}
|
2068
|
-
|
2069
|
-
|
2070
|
-
|
2071
|
-
|
2072
|
-
|
2073
|
-
|
2074
|
-
|
2075
|
-
|
2059
|
+
|
2060
|
+
return (void *)data;
|
2061
|
+
}
|
2062
|
+
|
2063
|
+
static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
|
2064
|
+
lm_ggml_aligned_free(buffer->context, buffer->size);
|
2065
|
+
}
|
2066
|
+
|
2067
|
+
static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
2068
|
+
memset((char *)tensor->data + offset, value, size);
|
2069
|
+
|
2070
|
+
LM_GGML_UNUSED(buffer);
|
2071
|
+
}
|
2072
|
+
|
2073
|
+
static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
2074
|
+
memcpy((char *)tensor->data + offset, data, size);
|
2075
|
+
|
2076
|
+
LM_GGML_UNUSED(buffer);
|
2077
|
+
}
|
2078
|
+
|
2079
|
+
static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
2080
|
+
memcpy(data, (const char *)tensor->data + offset, size);
|
2081
|
+
|
2082
|
+
LM_GGML_UNUSED(buffer);
|
2083
|
+
}
|
2084
|
+
|
2085
|
+
static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
|
2086
|
+
if (lm_ggml_backend_buffer_is_host(src->buffer)) {
|
2087
|
+
memcpy(dst->data, src->data, lm_ggml_nbytes(src));
|
2088
|
+
return true;
|
2076
2089
|
}
|
2090
|
+
return false;
|
2077
2091
|
|
2078
|
-
|
2079
|
-
|
2080
|
-
|
2081
|
-
|
2082
|
-
|
2083
|
-
|
2084
|
-
|
2085
|
-
|
2086
|
-
|
2087
|
-
|
2088
|
-
|
2089
|
-
|
2092
|
+
LM_GGML_UNUSED(buffer);
|
2093
|
+
}
|
2094
|
+
|
2095
|
+
static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
|
2096
|
+
memset(buffer->context, value, buffer->size);
|
2097
|
+
}
|
2098
|
+
|
2099
|
+
static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
|
2100
|
+
/* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
|
2101
|
+
/* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
|
2102
|
+
/* .init_tensor = */ NULL, // no initialization required
|
2103
|
+
/* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
|
2104
|
+
/* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
|
2105
|
+
/* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
|
2106
|
+
/* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
|
2107
|
+
/* .clear = */ lm_ggml_backend_cpu_buffer_clear,
|
2108
|
+
/* .reset = */ NULL,
|
2109
|
+
};
|
2110
|
+
|
2111
|
+
static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
|
2112
|
+
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
2113
|
+
/* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
|
2114
|
+
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+// CPU backend - buffer type
+
+static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    LM_GGML_UNUSED(buft);
+}
+
+static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = lm_ggml_aligned_malloc(size);
+
+    if (data == NULL) {
+        LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
     }
 
-    return
+    return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
 }
 
-static
-
+static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
 
-
-
-            int split_backend_id = split->backend_id;
-            lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
+    LM_GGML_UNUSED(buft);
+}
 
-
-
-            lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
-            struct lm_ggml_tensor * input = split->inputs[j];
-            struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
+static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
+    return true;
 
-
-
-                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                    lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                } else {
-                    lm_ggml_backend_synchronize(split_backend);
-                }
-                lm_ggml_backend_tensor_copy(input, input_cpy);
-            } else {
-                // wait for the split backend to finish using the input before overwriting it
-                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                    lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
-                } else {
-                    lm_ggml_backend_synchronize(split_backend);
-                }
-                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
-                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
-                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
-                    lm_ggml_backend_synchronize(input_backend);
-                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                        lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                    } else {
-                        lm_ggml_backend_synchronize(split_backend);
-                    }
-                    lm_ggml_backend_tensor_copy(input, input_cpy);
-                }
-            }
-        }
+    LM_GGML_UNUSED(buft);
+}
 
-
-
-
-
-
-
-    //
-
-
+lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
+    static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
+            /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
 
-
-
+    return &lm_ggml_backend_cpu_buffer_type;
+}
 
-
+static const char * lm_ggml_backend_cpu_buffer_from_ptr_type_get_name(lm_ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
 
-
-
-                t = split->graph.nodes[++j1];
-                need = sched->callback_eval(t, true, sched->callback_eval_user_data);
-            }
+    LM_GGML_UNUSED(buft);
+}
 
-
+static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ lm_ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
+            /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
 
-
-
-    return ec;
-}
+    return &lm_ggml_backend_cpu_buffer_type;
+}
 
-
-        lm_ggml_backend_synchronize(split_backend);
+#ifdef LM_GGML_USE_CPU_HBM
 
-
-            break;
-        }
+// buffer type HBM
 
-
-    }
-}
+#include <hbwmalloc.h>
 
-
-
-
-
-
-
+static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    LM_GGML_UNUSED(buft);
+}
+
+static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
     }
 
-
+    lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
 
-    return
+    return buffer;
 }
 
-
-
-
-
-
-
-
-
-
+lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
+    static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface = */ {
+            /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
+            /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
+            /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
 
-
+    return &lm_ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
 
-
-
-
+static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
+    static lm_ggml_backend_buffer_type_t bufts[] = {
+#ifdef LM_GGML_USE_CPU_HBM
+        lm_ggml_backend_cpu_hbm_buffer_type(),
+#endif
+        NULL
+    };
 
-
-    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
-    sched->hash_set = lm_ggml_hash_set_new(graph_size);
-    sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
-    sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
+    return bufts;
 
-
-
-    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
-    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
-    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
-    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
+    LM_GGML_UNUSED(device);
+}
 
-
-    sched->context_buffer = (char *) malloc(sched->context_buffer_size);
+// CPU backend - backend (stream)
 
-
-
-
+struct lm_ggml_backend_cpu_context {
+    int n_threads;
+    lm_ggml_threadpool_t threadpool;
 
-
-
-        sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
-        LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
-        if (sched->n_copies > 1) {
-            for (int c = 0; c < sched->n_copies; c++) {
-                sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
-            }
-        }
-    }
+    uint8_t * work_data;
+    size_t work_size;
 
-
+    lm_ggml_abort_callback abort_callback;
+    void * abort_callback_data;
+};
 
-
+static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
+    return "CPU";
 
-
+    LM_GGML_UNUSED(backend);
 }
 
-void
-
-
-
-
-        for (int c = 0; c < sched->n_copies; c++) {
-            lm_ggml_backend_event_free(sched->events[b][c]);
-        }
-    }
-    lm_ggml_gallocr_free(sched->galloc);
-    lm_ggml_free(sched->ctx);
-    lm_ggml_hash_set_free(&sched->hash_set);
-    free(sched->splits);
-    free(sched->hv_tensor_backend_ids);
-    free(sched->hv_tensor_copies);
-    free(sched->node_backend_ids);
-    free(sched->leaf_backend_ids);
-    free(sched->prev_node_backend_ids);
-    free(sched->prev_leaf_backend_ids);
-    free(sched->context_buffer);
-    free(sched->graph.nodes);
-    free(sched->graph.leafs);
-    free(sched);
+static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
+    struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
+    delete[] cpu_ctx->work_data;
+    delete cpu_ctx;
+    delete backend;
 }
 
-
-
-
-
-    memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
-    memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
-        sched->is_reset = true;
-    }
-    sched->is_alloc = false;
-}
+struct lm_ggml_backend_plan_cpu {
+    struct lm_ggml_cplan cplan;
+    struct lm_ggml_cgraph cgraph;
+};
 
-
-
+static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
+    struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
 
-
+    struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
 
-
-
+    cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
+        if (cpu_plan->cplan.work_data == NULL) {
+            delete cpu_plan;
+            return NULL;
+        }
     }
 
-
-
+    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
 
-    return
+    return cpu_plan;
 }
 
-
-
+static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
+    struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
 
-
+    delete[] cpu_plan->cplan.work_data;
+    delete cpu_plan;
 
+    LM_GGML_UNUSED(backend);
+}
 
-
-
-}
+static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
+    struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
 
-
+    return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
 
-
+    LM_GGML_UNUSED(backend);
 }
 
-enum lm_ggml_status
-
-    lm_ggml_backend_sched_synchronize(sched);
-    return err;
-}
+static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
+    struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
 
-
-    if (!sched->is_reset && !sched->is_alloc) {
-        lm_ggml_backend_sched_reset(sched);
-    }
+    struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (
-
+    if (cpu_ctx->work_size < cplan.work_size) {
+        delete[] cpu_ctx->work_data;
+        cpu_ctx->work_data = new uint8_t[cplan.work_size];
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
            return LM_GGML_STATUS_ALLOC_FAILED;
         }
+        cpu_ctx->work_size = cplan.work_size;
    }
+    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
 
-
-
+    cplan.abort_callback = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
 
-
-    for (int i = 0; i < sched->n_backends; i++) {
-        lm_ggml_backend_synchronize(sched->backends[i]);
-    }
+    return lm_ggml_graph_compute(cgraph, &cplan);
 }
 
-
-
-
-
+static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_get_name,
+    /* .free = */ lm_ggml_backend_cpu_free,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor_async = */ NULL,
+    /* .cpy_tensor_async = */ NULL,
+    /* .synchronize = */ NULL,
+    /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update = */ NULL,
+    /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+};
 
-
-
+static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
+    static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+    return &guid;
 }
 
-
-
-
+lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
+    // initialize CPU backend now to avoid slowing the first graph computation
+    lm_ggml_cpu_init();
 
-
-
+    struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
+    if (ctx == NULL) {
+        return NULL;
+    }
+
+    ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
+    ctx->threadpool = NULL;
+    ctx->work_data = NULL;
+    ctx->work_size = 0;
+    ctx->abort_callback = NULL;
+    ctx->abort_callback_data = NULL;
+
+    lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
+        /* .guid = */ lm_ggml_backend_cpu_guid(),
+        /* .interface = */ lm_ggml_backend_cpu_i,
+        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .context = */ ctx,
+    };
+
+    if (cpu_backend == NULL) {
+        delete ctx;
+        return NULL;
+    }
+
+    return cpu_backend;
 }
 
-
-
-    return sched->backends[i];
+bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
+    return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
 }
 
-
-
-    LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
 
-
+    struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
 }
 
-void
-
-    LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    tensor_backend_id(node) = backend_index;
-    SET_CAUSE(node, "usr");
-    sched->is_reset = false;
-}
+void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
 
-
-
-    if (
-
+    struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        lm_ggml_threadpool_pause(ctx->threadpool);
    }
-
+    ctx->threadpool = threadpool;
 }
 
-
+void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
 
-
-
-
-
-    LM_GGML_ASSERT(tensor->view_src->data != NULL);
+    struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
 
-
-
-
+lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
 }
 
-
-    LM_GGML_ASSERT(tensor->buffer == NULL);
-    LM_GGML_ASSERT(tensor->data == NULL);
-    LM_GGML_ASSERT(tensor->view_src == NULL);
-    LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
-    LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
-                   (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
+// CPU backend - device
 
-
-
-
+struct lm_ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    lm_ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        while (std::isspace(*p)) {
+                            p++;
+                        }
+                        while (std::isspace(p[strlen(p) - 1])) {
+                            p[strlen(p) - 1] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                         TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                         0,
+                         KEY_READ,
+                         &hKey) == ERROR_SUCCESS) {
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                 TEXT("ProcessorNameString"),
+                                 NULL,
+                                 NULL,
+                                 NULL,
+                                 &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                     TEXT("ProcessorNameString"),
+                                     NULL,
+                                     NULL,
+                                     (LPBYTE)&description[0], // NOLINT
+                                     &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
+static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
+    return "CPU";
+
+    LM_GGML_UNUSED(dev);
 }
 
-static
-struct
+static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
+    struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
 
-
-
+    return ctx->description.c_str();
+}
 
-
-
-
-
+static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
 
-
-
-        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
-        dst->view_offs = src->view_offs;
-    }
-    dst->op = src->op;
-    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
-    lm_ggml_set_name(dst, src->name);
+    LM_GGML_UNUSED(dev);
+}
 
-
-
-        struct lm_ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            continue;
-        }
-        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
-    }
+static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
+    return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
 
-
-    return dst;
+    LM_GGML_UNUSED(dev);
 }
 
-static void
-
-
-
-
-
+static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
+    props->name = lm_ggml_backend_cpu_device_get_name(dev);
+    props->description = lm_ggml_backend_cpu_device_get_description(dev);
+    props->type = lm_ggml_backend_cpu_device_get_type(dev);
+    lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async = */ false,
+        /* .host_buffer = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events = */ false,
+    };
+}
 
-
-
-        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        lm_ggml_backend_view_init(dst);
-    }
-    else {
-        lm_ggml_backend_tensor_copy(src, dst);
-    }
+static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
+    return lm_ggml_backend_cpu_init();
 
-
-
-        struct lm_ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            continue;
-        }
-        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
-    }
+    LM_GGML_UNUSED(dev);
+    LM_GGML_UNUSED(params);
 }
 
-
-
-    struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
-    bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
+static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
+    return lm_ggml_backend_cpu_buffer_type();
 
-
-
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc = */ true
-    };
+    LM_GGML_UNUSED(dev);
+}
 
-
-
+static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
 
-
-
-
-    free(node_copies);
-    free(node_init);
-    lm_ggml_free(ctx_allocated);
-    lm_ggml_free(ctx_unallocated);
-    return {
-        /* .buffer = */ NULL,
-        /* .ctx_allocated = */ NULL,
-        /* .ctx_unallocated = */ NULL,
-        /* .graph = */ NULL,
-    };
-}
+    LM_GGML_UNUSED(dev);
+    LM_GGML_UNUSED(max_tensor_size);
+}
 
-
-
-
-
+static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
+    switch (op->op) {
+        case LM_GGML_OP_CPY:
+            return
+                op->type != LM_GGML_TYPE_IQ2_XXS &&
+                op->type != LM_GGML_TYPE_IQ2_XS &&
+                op->type != LM_GGML_TYPE_IQ1_S &&
+                op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
+        case LM_GGML_OP_MUL_MAT:
+            return op->src[1]->type == LM_GGML_TYPE_F32;// FIXME || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+        case LM_GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case LM_GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
+        case LM_GGML_OP_OUT_PROD:
+            return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
+        default:
+            return true;
    }
 
-
-
-    if (buffer == NULL) {
-        fprintf(stderr, "failed to allocate buffer for graph copy\n");
-        lm_ggml_hash_set_free(&hash_set);
-        free(node_copies);
-        free(node_init);
-        lm_ggml_free(ctx_allocated);
-        lm_ggml_free(ctx_unallocated);
-        return {
-            /* .buffer = */ NULL,
-            /* .ctx_allocated = */ NULL,
-            /* .ctx_unallocated = */ NULL,
-            /* .graph = */ NULL,
-        };
-    }
+    LM_GGML_UNUSED(dev);
+}
 
-
+static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
+    return lm_ggml_backend_buft_is_host(buft);
 
-
-
-        struct lm_ggml_tensor * node = graph->nodes[i];
-        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
-    }
+    LM_GGML_UNUSED(dev);
+}
 
-
-
-
-
-
-
-
-
+static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
+    /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
+    /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
+    /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
+    /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
+    /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
+    /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
+    /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
+    /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
+    /* .offload_op = */ NULL,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
 
-
-    free(node_copies);
-    free(node_init);
+// CPU backend - backend (reg)
 
-
-
-        /* .ctx_allocated = */ ctx_allocated,
-        /* .ctx_unallocated = */ ctx_unallocated,
-        /* .graph = */ graph_copy,
-    };
-}
+static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
+    return "CPU";
 
-
-    lm_ggml_backend_buffer_free(copy.buffer);
-    lm_ggml_free(copy.ctx_allocated);
-    lm_ggml_free(copy.ctx_unallocated);
+    LM_GGML_UNUSED(reg);
 }
 
-
-
-    if (copy.buffer == NULL) {
-        return false;
-    }
+static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
+    return 1;
 
-
-
+    LM_GGML_UNUSED(reg);
+}
 
-
+static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
+    LM_GGML_ASSERT(index == 0);
 
-
-
-
-
+    static lm_ggml_backend_cpu_device_context ctx;
+    static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
+        /* .iface = */ lm_ggml_backend_cpu_device_i,
+        /* .reg = */ reg,
+        /* .context = */ &ctx,
+    };
 
-
+    return &lm_ggml_backend_cpu_device;
+}
 
-
-
+static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
+        return (void *)lm_ggml_backend_cpu_set_n_threads;
+    }
+    if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
+        return (void *)lm_ggml_backend_cpu_get_extra_bufts;
+    }
 
-
-        lm_ggml_backend_graph_compute(backend2, &g2v);
+    return NULL;
 
-
-
-}
+    LM_GGML_UNUSED(reg);
+}
 
-
-
-
-
-
+static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
+    /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
+    /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
+    /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
+};
 
-
+lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
+    static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
+        /* .iface = */ lm_ggml_backend_cpu_reg_i,
+        /* .context = */ NULL,
+    };
 
-    return
+    return &lm_ggml_backend_cpu_reg;
 }
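For context, the hunk above adds a self-contained CPU backend (stream, device, and registry) to ggml-backend.cpp. The sketch below is not part of the package; it is a minimal, hedged illustration of how downstream code could drive these new entry points. The wrapper name `run_on_cpu` is hypothetical, and `lm_ggml_backend_free` / `lm_ggml_backend_graph_compute` are assumed from the existing lm_ggml backend API rather than shown in this diff.

```cpp
#include "ggml-backend.h"

// Hypothetical helper: build the CPU backend added in this diff and run a graph on it.
static enum lm_ggml_status run_on_cpu(struct lm_ggml_cgraph * graph, int n_threads) {
    // lm_ggml_backend_cpu_init() calls lm_ggml_cpu_init() and wires up
    // lm_ggml_backend_cpu_i plus the CPU device from lm_ggml_backend_cpu_reg().
    lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
    if (backend == NULL) {
        return LM_GGML_STATUS_ALLOC_FAILED;
    }

    // valid only for CPU backends, guarded internally by lm_ggml_backend_is_cpu()
    lm_ggml_backend_cpu_set_n_threads(backend, n_threads);

    // graph_compute grows the work buffer stored in lm_ggml_backend_cpu_context as needed
    enum lm_ggml_status status = lm_ggml_backend_graph_compute(backend, graph);

    lm_ggml_backend_free(backend); // assumed lm_ggml API, dispatches to lm_ggml_backend_cpu_free()
    return status;
}
```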