whispercpp 1.3.0 → 1.3.1
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/LICENSE +1 -1
- data/README.md +165 -434
- data/Rakefile +60 -11
- data/ext/.gitignore +13 -0
- data/ext/cpu.mk +9 -0
- data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
- data/ext/extconf.rb +185 -16
- data/ext/ggml/include/ggml-alloc.h +76 -0
- data/ext/ggml/include/ggml-backend.h +352 -0
- data/ext/ggml/include/ggml-blas.h +25 -0
- data/ext/ggml/include/ggml-cann.h +123 -0
- data/ext/ggml/include/ggml-cpp.h +38 -0
- data/ext/ggml/include/ggml-cpu.h +135 -0
- data/ext/ggml/include/ggml-cuda.h +47 -0
- data/ext/ggml/include/ggml-kompute.h +50 -0
- data/ext/ggml/include/ggml-metal.h +66 -0
- data/ext/ggml/include/ggml-opencl.h +26 -0
- data/ext/ggml/include/ggml-opt.h +216 -0
- data/ext/ggml/include/ggml-rpc.h +28 -0
- data/ext/ggml/include/ggml-sycl.h +49 -0
- data/ext/ggml/include/ggml-vulkan.h +31 -0
- data/ext/{ggml.h → ggml/include/ggml.h} +479 -596
- data/ext/ggml/src/ggml-alloc.c +1037 -0
- data/ext/ggml/src/ggml-amx/common.h +94 -0
- data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
- data/ext/ggml/src/ggml-amx/mmq.h +17 -0
- data/ext/ggml/src/ggml-backend-impl.h +256 -0
- data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
- data/ext/ggml/src/ggml-backend.cpp +1999 -0
- data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
- data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
- data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
- data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
- data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- data/ext/ggml/src/ggml-cann/common.h +286 -0
- data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
- data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
- data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
- data/ext/ggml/src/ggml-common.h +1853 -0
- data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
- data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
- data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
- data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
- data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- data/ext/ggml/src/ggml-impl.h +556 -0
- data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
- data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
- data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
- data/ext/ggml/src/ggml-opt.cpp +854 -0
- data/ext/ggml/src/ggml-quants.c +5238 -0
- data/ext/ggml/src/ggml-quants.h +100 -0
- data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
- data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
- data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
- data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
- data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
- data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
- data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
- data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
- data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
- data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
- data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
- data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
- data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
- data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- data/ext/ggml/src/ggml-threading.cpp +12 -0
- data/ext/ggml/src/ggml-threading.h +14 -0
- data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
- data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- data/ext/ggml/src/ggml.c +7694 -0
- data/ext/{whisper.h → include/whisper.h} +23 -22
- data/ext/metal-embed.mk +17 -0
- data/ext/metal.mk +6 -0
- data/ext/ruby_whisper.cpp +1492 -9
- data/ext/ruby_whisper.h +10 -0
- data/ext/scripts/get-flags.mk +38 -0
- data/ext/src/coreml/whisper-decoder-impl.h +146 -0
- data/ext/src/coreml/whisper-decoder-impl.m +201 -0
- data/ext/src/coreml/whisper-encoder-impl.h +142 -0
- data/ext/src/coreml/whisper-encoder-impl.m +197 -0
- data/ext/src/coreml/whisper-encoder.h +26 -0
- data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
- data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
- data/ext/{whisper.cpp → src/whisper.cpp} +661 -492
- data/extsources.rb +6 -0
- data/lib/whisper/model/uri.rb +157 -0
- data/lib/whisper.rb +2 -0
- data/tests/helper.rb +7 -0
- data/tests/jfk_reader/.gitignore +5 -0
- data/tests/jfk_reader/extconf.rb +3 -0
- data/tests/jfk_reader/jfk_reader.c +68 -0
- data/tests/test_callback.rb +160 -0
- data/tests/test_error.rb +20 -0
- data/tests/test_model.rb +71 -0
- data/tests/test_package.rb +31 -0
- data/tests/test_params.rb +160 -0
- data/tests/test_segment.rb +83 -0
- data/tests/test_whisper.rb +211 -123
- data/whispercpp.gemspec +36 -0
- metadata +137 -11
- data/ext/ggml.c +0 -21755
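
The 1.3.1 gem reorganizes the vendored ggml/whisper sources under `data/ext/`, adds model-URI handling (`lib/whisper/model/uri.rb`), and ships a much larger test suite. For orientation, a minimal usage sketch in the spirit of the bundled `data/README.md` is shown below; the constructor and parameter names (`Whisper::Context`, `Whisper::Params`, `#transcribe`, `language=`) are assumptions taken from that README rather than a verified 1.3.1 API.

```ruby
require "whisper"

# Assumed API, following data/README.md in this gem: load a model,
# configure decoding parameters, and transcribe a 16-bit WAV file.
whisper = Whisper::Context.new("base.en")   # model name or path (assumption)

params = Whisper::Params.new
params.language = "en"

whisper.transcribe("jfk.wav", params) do |whole_text|
  puts whole_text
end
```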
data/ext/ggml/src/ggml-backend.cpp (new file, +1999 lines; shown below as plain source, viewer markup removed)
@@ -0,0 +1,1999 @@
// Note: porting this file to C++ is a work in progress

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#    define NOMINMAX
#endif
#include <windows.h>
#endif

#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-alloc.h"
#include "ggml-impl.h"

#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <vector>

#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#endif


// backend buffer type

const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name(buft);
}

ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    if (size == 0) {
        // return a dummy buffer for zero-sized allocations
        return ggml_backend_buffer_init(buft, {}, NULL, 0);
    }

    return buft->iface.alloc_buffer(buft, size);
}

size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_alignment(buft);
}

size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
    // get_max_size is optional, defaults to SIZE_MAX
    if (buft->iface.get_max_size) {
        return buft->iface.get_max_size(buft);
    }
    return SIZE_MAX;
}

size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
    // get_alloc_size is optional, defaults to ggml_nbytes
    if (buft->iface.get_alloc_size) {
        size_t size = buft->iface.get_alloc_size(buft, tensor);
        assert(size >= ggml_nbytes(tensor));
        return size;
    }
    return ggml_nbytes(tensor);
}

bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
    if (buft->iface.is_host) {
        return buft->iface.is_host(buft);
    }
    return false;
}

ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
    return buft->device;
}

// backend buffer

ggml_backend_buffer_t ggml_backend_buffer_init(
               ggml_backend_buffer_type_t buft,
        struct ggml_backend_buffer_i      iface,
               void *                     context,
               size_t                     size) {
    ggml_backend_buffer_t buffer = new ggml_backend_buffer {
        /* .interface = */ iface,
        /* .buft      = */ buft,
        /* .context   = */ context,
        /* .size      = */ size,
        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
    };

    return buffer;
}

const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
}

void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer == NULL) {
        return;
    }

    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    delete buffer;
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    return buffer->size;
}

void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    // get_base is optional if the buffer is zero-sized
    if (buffer->size == 0) {
        return NULL;
    }

    void * base = buffer->iface.get_base(buffer);

    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

    return base;
}

void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    // init_tensor is optional
    if (buffer->iface.init_tensor) {
        buffer->iface.init_tensor(buffer, tensor);
    }
}

void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    // clear is optional if the buffer is zero-sized
    if (buffer->size == 0) {
        return;
    }

    buffer->iface.clear(buffer, value);
}

size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
}

size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}

size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}

bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
}

void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    buffer->usage = usage;

    // FIXME: add a generic callback to the buffer interface
    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
        ggml_backend_multi_buffer_set_usage(buffer, usage);
    }
}

enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
    return buffer->usage;
}

ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
    return buffer->buft;
}

void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
    if (buffer->iface.reset) {
        buffer->iface.reset(buffer);
    }
}

bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
    if (dst_buf->iface.cpy_tensor) {
        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
    }
    return false;
}

// backend

ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
    if (backend == NULL) {
        return NULL;
    }
    return backend->guid;
}

const char * ggml_backend_name(ggml_backend_t backend) {
    if (backend == NULL) {
        return "NULL";
    }
    return backend->iface.get_name(backend);
}

void ggml_backend_free(ggml_backend_t backend) {
    if (backend == NULL) {
        return;
    }

    backend->iface.free(backend);
}

ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
    return ggml_backend_dev_buffer_type(backend->device);
}

ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
}

size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
}

size_t ggml_backend_get_max_size(ggml_backend_t backend) {
    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
}

void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

    if (backend->iface.set_tensor_async == NULL) {
        ggml_backend_tensor_set(tensor, data, offset, size);
    } else {
        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
    }
}

void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    if (backend->iface.get_tensor_async == NULL) {
        ggml_backend_tensor_get(tensor, data, offset, size);
    } else {
        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
    }
}

void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
        return;
    }

    GGML_ASSERT(buf != NULL && "tensor buffer not set");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

    buf->iface.set_tensor(buf, tensor, data, offset, size);
}

void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
        return;
    }

    GGML_ASSERT(buf != NULL && "tensor buffer not set");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    buf->iface.get_tensor(buf, tensor, data, offset, size);
}

void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
        return;
    }

    GGML_ASSERT(buf != NULL && "tensor buffer not set");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");

    buf->iface.memset_tensor(buf, tensor, value, offset, size);
}

void ggml_backend_synchronize(ggml_backend_t backend) {
    if (backend->iface.synchronize == NULL) {
        return;
    }

    backend->iface.synchronize(backend);
}

ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    GGML_ASSERT(backend->iface.graph_plan_create != NULL);

    return backend->iface.graph_plan_create(backend, cgraph);
}

void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(backend->iface.graph_plan_free != NULL);

    backend->iface.graph_plan_free(backend, plan);
}

enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);

    return backend->iface.graph_plan_compute(backend, plan);
}

enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
    ggml_backend_synchronize(backend);
    return err;
}

enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return backend->iface.graph_compute(backend, cgraph);
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return ggml_backend_dev_supports_op(backend->device, op);
}

bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    return ggml_backend_dev_supports_buft(backend->device, buft);
}

bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return ggml_backend_dev_offload_op(backend->device, op);
}

ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
    return backend->device;
}

// backend copy

static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
}

void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    if (src == dst) {
        return;
    }

    if (ggml_backend_buffer_is_host(src->buffer)) {
        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
        size_t nbytes = ggml_nbytes(src);
        void * data = malloc(nbytes);
        ggml_backend_tensor_get(src, data, 0, nbytes);
        ggml_backend_tensor_set(dst, data, 0, nbytes);
        free(data);
    }
}

void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    if (src == dst) {
        return;
    }

    if (backend_dst->iface.cpy_tensor_async != NULL) {
        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
            return;
        }
    }

    // an async copy would normally happen after all the queued operations on both backends are completed
    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
    ggml_backend_synchronize(backend_src);
    ggml_backend_synchronize(backend_dst);
    ggml_backend_tensor_copy(src, dst);
}

// events

ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
    // null device is allowed for the transition period to the device interface
    if (device == NULL || device->iface.event_new == NULL) {
        return NULL;
    }
    return device->iface.event_new(device);
}

void ggml_backend_event_free(ggml_backend_event_t event) {
    if (event == NULL) {
        return;
    }
    event->device->iface.event_free(event->device, event);
}

void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
    GGML_ASSERT(backend->iface.event_record != NULL);

    backend->iface.event_record(backend, event);
}

void ggml_backend_event_synchronize(ggml_backend_event_t event) {
    GGML_ASSERT(event->device->iface.event_synchronize);

    event->device->iface.event_synchronize(event->device, event);
}

void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    GGML_ASSERT(backend->iface.event_wait != NULL);

    backend->iface.event_wait(backend, event);
}

// Backend device

const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
    return device->iface.get_name(device);
}

const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
    return device->iface.get_description(device);
}

void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
    device->iface.get_memory(device, free, total);
}

enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
    return device->iface.get_type(device);
}

void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
    memset(props, 0, sizeof(*props));
    device->iface.get_props(device, props);
}

ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
    return device->reg;
}

ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
    return device->iface.init_backend(device, params);
}

ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
    return device->iface.get_buffer_type(device);
}

ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
    if (device->iface.get_host_buffer_type == NULL) {
        return NULL;
    }

    return device->iface.get_host_buffer_type(device);
}

ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
    return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
}

bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
    return device->iface.supports_op(device, op);
}

bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
    return device->iface.supports_buft(device, buft);
}

bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
    if (device->iface.offload_op != NULL) {
        return device->iface.offload_op(device, op);
    }

    return false;
}

// Backend (reg)

const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
    return reg->iface.get_name(reg);
}

size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
    return reg->iface.get_device_count(reg);
}

ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
    return reg->iface.get_device(reg, index);
}

void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (!reg->iface.get_proc_address) {
        return NULL;
    }
    return reg->iface.get_proc_address(reg, name);
}

// multi-buffer buffer

struct ggml_backend_multi_buffer_context {
    ggml_backend_buffer_t * buffers;
    size_t n_buffers;
};

static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
    for (size_t i = 0; i < ctx->n_buffers; i++) {
        ggml_backend_buffer_free(ctx->buffers[i]);
    }

    free(ctx->buffers);
    free(ctx);
}

static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
    for (size_t i = 0; i < ctx->n_buffers; i++) {
        ggml_backend_buffer_clear(ctx->buffers[i], value);
    }
}

static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
    /* .free_buffer   = */ ggml_backend_multi_buffer_free_buffer,
    /* .get_base      = */ NULL,
    /* .init_tensor   = */ NULL,
    /* .memset_tensor = */ NULL,
    /* .set_tensor    = */ NULL,
    /* .get_tensor    = */ NULL,
    /* .cpy_tensor    = */ NULL,
    /* .clear         = */ ggml_backend_multi_buffer_clear,
    /* .reset         = */ NULL,
};

ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
    ctx->n_buffers = n_buffers;
    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));

    GGML_ASSERT(ctx->buffers != NULL);

    size_t total_size = 0;
    for (size_t i = 0; i < n_buffers; i++) {
        ctx->buffers[i] = buffers[i];
        total_size += ggml_backend_buffer_get_size(buffers[i]);
    }

    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
}

bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
}

void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
    for (size_t i = 0; i < ctx->n_buffers; i++) {
        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
    }
}

// creates a copy of the tensor with the same memory layout
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        dup->nb[i] = tensor->nb[i];
    }
    return dup;
}

static bool ggml_is_view_op(enum ggml_op op) {
    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
}

// scheduler

#ifndef GGML_SCHED_MAX_BACKENDS
#define GGML_SCHED_MAX_BACKENDS 16
#endif

#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
#endif

#ifndef GGML_SCHED_MAX_COPIES
#define GGML_SCHED_MAX_COPIES 4
#endif

struct ggml_backend_sched_split {
    int backend_id;
    int i_start;
    int i_end;
    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
    int n_inputs;
    // graph view of this split
    struct ggml_cgraph graph;
};

struct ggml_backend_sched {
    bool is_reset; // true if the scheduler has been reset since the last graph split
    bool is_alloc;

    int n_backends;

    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
    ggml_gallocr_t galloc;

    // hash map of the nodes in the graph
    struct ggml_hash_set hash_set;
    int * hv_tensor_backend_ids; // [hash_set.size]
    struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]

    int * node_backend_ids; // [graph_size]
    int * leaf_backend_ids; // [graph_size]

    int * prev_node_backend_ids; // [graph_size]
    int * prev_leaf_backend_ids; // [graph_size]

    // copy of the graph with modified inputs
    struct ggml_cgraph graph;

    // graph splits
    struct ggml_backend_sched_split * splits;
    int n_splits;
    int splits_capacity;

    // pipeline parallelism support
    int n_copies;
    int cur_copy;
    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
    int n_graph_inputs;

    struct ggml_context * ctx;

    ggml_backend_sched_eval_callback callback_eval;
    void * callback_eval_user_data;

    char * context_buffer;
    size_t context_buffer_size;

    int debug;
};

#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)

// returns the priority of the backend, lower id is higher priority
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
    for (int i = 0; i < sched->n_backends; i++) {
        if (sched->backends[i] == backend) {
            return i;
        }
    }
    return -1;
}

static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
    if (buffer == NULL) {
        return -1;
    }

    // find highest prio backend that supports the buffer type and the op
    for (int i = 0; i < sched->n_backends; i++) {
        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
            ggml_backend_supports_op(sched->backends[i], op)) {
            return i;
        }
    }

#ifndef NDEBUG
    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
        __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
#endif

    return -1;
}

#if 0
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
#define SET_CAUSE(node, ...)
#define GET_CAUSE(node) ""
#endif

// returns the backend that should be used for the node based on the current locations
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
    // assign pre-allocated nodes to their backend
    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
    if (cur_backend_id != -1) {
        SET_CAUSE(tensor, "1.dst");
        return cur_backend_id;
    }

    // view_src
    if (tensor->view_src != NULL) {
        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
        if (cur_backend_id != -1) {
            SET_CAUSE(tensor, "1.vsrc");
            return cur_backend_id;
        }
    }

    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
        // since the tensor is pre-allocated, it cannot be moved to another backend
        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
    }

    // graph input
    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
        SET_CAUSE(tensor, "1.inp");
        return cur_backend_id;
    }

    // operations with weights are preferably run on the same backend as the weights
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = tensor->src[i];
        if (src == NULL) {
            continue;
        }
        // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
        // not an ideal solution
        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
            // check if a backend with higher prio wants to offload the op
            if (src_backend_id == sched->n_backends - 1) {
                for (int b = 0; b < src_backend_id; b++) {
                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                        SET_CAUSE(tensor, "1.off");
                        return b;
                    }
                }
            }
            SET_CAUSE(tensor, "1.wgt%d", i);
            return src_backend_id;
        }
    }

    return -1;
}

static char * fmt_size(size_t size) {
    static char buffer[128];
    if (size >= 1024*1024) {
        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
    } else {
        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
    }
    return buffer;
}

static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    int cur_split = 0;
    for (int i = 0; i < graph->n_nodes; i++) {
        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                sched->splits[cur_split].n_inputs);
            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
            }
            GGML_LOG_DEBUG("\n");
            cur_split++;
        }
        struct ggml_tensor * node = graph->nodes[i];
        if (ggml_is_view_op(node->op)) {
            continue;
        }
        if (sched->debug > 1) {
            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    continue;
                }
                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
            }
            GGML_LOG_DEBUG("\n");
        }
    }
}

static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
    ggml_backend_buffer_type_t buft = NULL;

    if (buf) {
        // the tensor is already allocated
        buft = buf->buft;
    } else {
        // see if the tensor already has a backend assigned, and use the buffer type of that backend
        int tensor_backend_id = tensor_backend_id(t);
        if (tensor_backend_id == -1 && t->view_src) {
            tensor_backend_id = tensor_backend_id(t->view_src);
        }
        if (tensor_backend_id != -1) {
            buft = sched->bufts[tensor_backend_id];
        }
    }

    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
}

static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
        *node_backend_id = cur_backend_id;
        SET_CAUSE(node, "2.sup");
    }
}

// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    // reset splits
    sched->n_splits = 0;
    sched->n_graph_inputs = 0;
    sched->is_reset = false;

    struct ggml_init_params params = {
        /* .mem_size =   */ sched->context_buffer_size,
        /* .mem_buffer = */ sched->context_buffer,
        /* .no_alloc =   */ true
    };

    ggml_free(sched->ctx);

    sched->ctx = ggml_init(params);
    if (sched->ctx == NULL) {
        GGML_ABORT("%s: failed to initialize context\n", __func__);
    }

    // pass 1: assign backends to ops with pre-allocated inputs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        int * leaf_backend_id = &tensor_backend_id(leaf);
        // do not overwrite user assignments
        if (*leaf_backend_id == -1) {
            *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
        }
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int * node_backend_id = &tensor_backend_id(node);
        // do not overwrite user assignments
        if (*node_backend_id == -1) {
            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);

#if 0
            // src
            if (node->op == GGML_OP_NONE) {
                continue;
            }

            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    continue;
                }
                int * src_backend_id = &tensor_backend_id(src);
                if (*src_backend_id == -1) {
                    *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
                }
            }
#endif
        }
    }

    // pass 2: expand current backend assignments
    // assign the same backend to adjacent nodes
    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
    // expand gpu down
    {
        int cur_backend_id = -1;
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
                    cur_backend_id = *node_backend_id;
                }
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }
    // expand gpu up
    {
        int cur_backend_id = -1;
        for (int i = graph->n_nodes - 1; i >= 0; i--) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
                    cur_backend_id = *node_backend_id;
                }
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }
    // expand rest down
    {
        int cur_backend_id = -1;
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                cur_backend_id = *node_backend_id;
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }
    // expand rest up
    {
        int cur_backend_id = -1;
        for (int i = graph->n_nodes - 1; i >= 0; i--) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                cur_backend_id = *node_backend_id;
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }

    // pass 3: upgrade nodes to higher prio backends with compatible buffer types
    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
    // however, we also need to verify that the sources are in compatible buffer types
    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
    // additionally, set remaining unassigned nodes to the backend with the most supported inputs
    // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (ggml_is_view_op(node->op)) {
            continue;
        }
        int * node_backend_id = &tensor_backend_id(node);
        if (*node_backend_id == -1) {
            // unassigned node: find the backend with the most supported inputs
            int n_supported_best = -1;
            for (int b = 0; b < sched->n_backends; b++) {
                if (ggml_backend_supports_op(sched->backends[b], node)) {
                    int n_supported = 0;
                    for (int j = 0; j < GGML_MAX_SRC; j++) {
                        struct ggml_tensor * src = node->src[j];
                        if (src == NULL) {
                            continue;
                        }
                        if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
                            n_supported++;
                        }
                    }
                    if (n_supported > n_supported_best) {
                        n_supported_best = n_supported;
                        *node_backend_id = b;
                        SET_CAUSE(node, "3.best");
                    }
                }
            }
        } else {
            // assigned node: upgrade to higher prio backend if possible
            for (int b = 0; b < *node_backend_id; b++) {
                if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
                    bool supported = true;
                    for (int j = 0; j < GGML_MAX_SRC; j++) {
                        struct ggml_tensor * src = node->src[j];
                        if (src == NULL) {
                            continue;
                        }
                        if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
                            supported = false;
                            break;
                        }
                    }
                    if (supported) {
                        *node_backend_id = b;
                        SET_CAUSE(node, "3.upg");
                        break;
                    }
                }
            }
        }
    }

    // pass 4: assign backends to remaining src from dst and view_src
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int * cur_backend_id = &tensor_backend_id(node);
        if (node->view_src != NULL && *cur_backend_id == -1) {
            *cur_backend_id = tensor_backend_id(node->view_src);
            SET_CAUSE(node, "4.vsrc");
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            int * src_backend_id = &tensor_backend_id(src);
            if (*src_backend_id == -1) {
                if (src->view_src != NULL) {
                    // views are always on the same backend as the source
                    *src_backend_id = tensor_backend_id(src->view_src);
                    SET_CAUSE(src, "4.vsrc");
                } else {
                    *src_backend_id = *cur_backend_id;
                    SET_CAUSE(src, "4.cur");
                }
            }
        }
    }

    // pass 5: split graph, find tensors that need to be copied
    {
        int i_split = 0;
        struct ggml_backend_sched_split * split = &sched->splits[0];
        // find the backend of the first split, skipping view ops
        int i = 0;
        for (; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (!ggml_is_view_op(node->op)) {
                split->backend_id = tensor_backend_id(node);
                break;
            }
        }
        split->i_start = 0;
        split->n_inputs = 0;
        int cur_backend_id = split->backend_id;
        for (; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];

            if (ggml_is_view_op(node->op)) {
                continue;
            }

            const int node_backend_id = tensor_backend_id(node);

            assert(node_backend_id != -1); // all nodes should be assigned by now

            // check if we should start a new split based on the sources of the current node
            bool need_new_split = false;
            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * src = node->src[j];
                    if (src == NULL) {
                        continue;
                    }
                    // check if a weight is on a different and incompatible backend
                    // by starting a new split, the memory of the previously offloaded weights can be reused
                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                        int src_backend_id = tensor_backend_id(src);
                        if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                            need_new_split = true;
                            break;
                        }
                    }
                    // check if the split has too many inputs
                    // FIXME: count the number of inputs instead of only checking when full
                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                        const size_t id = hash_id(src);
                        int src_backend_id = sched->hv_tensor_backend_ids[id];
                        bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                        if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
                            need_new_split = true;
                            break;
                        }
                    }
                }
            }

            if (node_backend_id != cur_backend_id || need_new_split) {
                split->i_end = i;
                i_split++;
                if (i_split >= sched->splits_capacity) {
                    sched->splits_capacity *= 2;
                    sched->splits = (ggml_backend_sched_split *)
                        realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                    GGML_ASSERT(sched->splits != NULL);
                }
                split = &sched->splits[i_split];
                split->backend_id = node_backend_id;
                split->i_start = i;
                split->n_inputs = 0;
                cur_backend_id = node_backend_id;
            }

            // find inputs that are not on the same backend
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    continue;
                }

                size_t src_id = hash_id(src);
                const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
                assert(src_backend_id != -1); // all inputs should be assigned by now

                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
                        ggml_backend_t backend = sched->backends[src_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
                            struct ggml_tensor * tensor_copy;
                            if (c == sched->cur_copy) {
                                tensor_copy = src; // use the original tensor as the current copy
                            } else {
                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                            }
                            if (sched->n_copies > 1) {
                                ggml_set_input(tensor_copy);
                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                            }
                            tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
                        int n_graph_inputs = sched->n_graph_inputs++;
                        GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                        sched->graph_inputs[n_graph_inputs] = src;
                    }
                }

                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                    // create a copy of the input in the split's backend
                    if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
                        ggml_backend_t backend = sched->backends[cur_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
                            struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                            if (sched->n_copies > 1) {
                                ggml_set_input(tensor_copy);
                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                            }
                            tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
                        int n_inputs = split->n_inputs++;
                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                        split->inputs[n_inputs] = src;
                    }
                    node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
                }
            }
        }
        split->i_end = graph->n_nodes;
        sched->n_splits = i_split + 1;
    }

    if (sched->debug) {
        ggml_backend_sched_print_assignments(sched, graph);
    }

    // swap node_backend_ids and leaf_backend_ids with prevs
    {
        int * tmp = sched->node_backend_ids;
        sched->node_backend_ids = sched->prev_node_backend_ids;
        sched->prev_node_backend_ids = tmp;

        tmp = sched->leaf_backend_ids;
        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
        sched->prev_leaf_backend_ids = tmp;
    }

    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
    if (sched->graph.size < graph_size) {
        sched->graph.size = graph_size;
        sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
        sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
        GGML_ASSERT(sched->graph.nodes != NULL);
        GGML_ASSERT(sched->graph.leafs != NULL);
    }
    sched->graph.n_nodes = 0;
    sched->graph.n_leafs = 0;

    struct ggml_cgraph * graph_copy = &sched->graph;

    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
            assert(graph_copy->size > (graph_copy->n_nodes + 1));

            struct ggml_tensor * input = split->inputs[j];
            const size_t input_id = hash_id(input);
            struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);

            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            input_dep->src[0] = input;
            sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

            // add a dependency to the input copy so that it is allocated at the start of the split
            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
        }

        for (int j = split->i_start; j < split->i_end; j++) {
            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
    }

    if (sched->n_copies > 1) {
        // add input copies as leafs so that they are allocated first
        for (int i = 0; i < sched->n_graph_inputs; i++) {
            struct ggml_tensor * input = sched->graph_inputs[i];
            size_t id = hash_id(input);
            int backend_id = tensor_backend_id(input);
            for (int c = 0; c < sched->n_copies; c++) {
                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                assert(graph_copy->size > graph_copy->n_leafs);
                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
            }
        }

        for (int i = 0; i < sched->n_splits; i++) {
            struct ggml_backend_sched_split * split = &sched->splits[i];
            int backend_id = split->backend_id;
            for (int j = 0; j < split->n_inputs; j++) {
                struct ggml_tensor * input = split->inputs[j];
                size_t id = hash_id(input);
                for (int c = 0; c < sched->n_copies; c++) {
|
1296
|
+
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
1297
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
1298
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1299
|
+
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
1300
|
+
}
|
1301
|
+
}
|
1302
|
+
}
|
1303
|
+
}
|
1304
|
+
|
1305
|
+
// add leafs from the original graph
|
1306
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
1307
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
1308
|
+
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
1309
|
+
assert(graph_copy->size > graph_copy->n_leafs);
|
1310
|
+
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
1311
|
+
}
|
1312
|
+
}
|
1313
|
+
|
1314
|
+
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
1315
|
+
bool backend_ids_changed = false;
|
1316
|
+
for (int i = 0; i < sched->graph.n_nodes; i++) {
|
1317
|
+
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
1318
|
+
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
1319
|
+
backend_ids_changed = true;
|
1320
|
+
break;
|
1321
|
+
}
|
1322
|
+
}
|
1323
|
+
if (!backend_ids_changed) {
|
1324
|
+
for (int i = 0; i < sched->graph.n_leafs; i++) {
|
1325
|
+
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
1326
|
+
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
1327
|
+
backend_ids_changed = true;
|
1328
|
+
break;
|
1329
|
+
}
|
1330
|
+
}
|
1331
|
+
}
|
1332
|
+
|
1333
|
+
// allocate graph
|
1334
|
+
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
1335
|
+
// the re-allocation may cause the split inputs to be moved to a different address
|
1336
|
+
ggml_backend_sched_synchronize(sched);
|
1337
|
+
#ifndef NDEBUG
|
1338
|
+
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
1339
|
+
#endif
|
1340
|
+
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
1341
|
+
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
1342
|
+
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
1343
|
+
return false;
|
1344
|
+
}
|
1345
|
+
}
|
1346
|
+
|
1347
|
+
return true;
|
1348
|
+
}
|
1349
|
+
|
1350
|
+
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
1351
|
+
struct ggml_backend_sched_split * splits = sched->splits;
|
1352
|
+
|
1353
|
+
for (int i = 0; i < sched->n_splits; i++) {
|
1354
|
+
struct ggml_backend_sched_split * split = &splits[i];
|
1355
|
+
int split_backend_id = split->backend_id;
|
1356
|
+
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
1357
|
+
|
1358
|
+
// copy the input tensors to the split backend
|
1359
|
+
for (int j = 0; j < split->n_inputs; j++) {
|
1360
|
+
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
1361
|
+
struct ggml_tensor * input = split->inputs[j];
|
1362
|
+
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
1363
|
+
|
1364
|
+
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
1365
|
+
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
1366
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1367
|
+
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
1368
|
+
} else {
|
1369
|
+
ggml_backend_synchronize(split_backend);
|
1370
|
+
}
|
1371
|
+
ggml_backend_tensor_copy(input, input_cpy);
|
1372
|
+
} else {
|
1373
|
+
// wait for the split backend to finish using the input before overwriting it
|
1374
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1375
|
+
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
1376
|
+
} else {
|
1377
|
+
ggml_backend_synchronize(split_backend);
|
1378
|
+
}
|
1379
|
+
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
1380
|
+
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
1381
|
+
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
1382
|
+
ggml_backend_synchronize(input_backend);
|
1383
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1384
|
+
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
1385
|
+
} else {
|
1386
|
+
ggml_backend_synchronize(split_backend);
|
1387
|
+
}
|
1388
|
+
ggml_backend_tensor_copy(input, input_cpy);
|
1389
|
+
}
|
1390
|
+
}
|
1391
|
+
}
|
1392
|
+
|
1393
|
+
if (!sched->callback_eval) {
|
1394
|
+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
1395
|
+
if (ec != GGML_STATUS_SUCCESS) {
|
1396
|
+
return ec;
|
1397
|
+
}
|
1398
|
+
} else {
|
1399
|
+
// similar to ggml_backend_compare_graph_backend
|
1400
|
+
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
1401
|
+
struct ggml_tensor * t = split->graph.nodes[j0];
|
1402
|
+
|
1403
|
+
// check if the user needs data from this node
|
1404
|
+
bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
1405
|
+
|
1406
|
+
int j1 = j0;
|
1407
|
+
|
1408
|
+
// determine the range [j0, j1] of nodes that can be computed together
|
1409
|
+
while (!need && j1 < split->graph.n_nodes - 1) {
|
1410
|
+
t = split->graph.nodes[++j1];
|
1411
|
+
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
1412
|
+
}
|
1413
|
+
|
1414
|
+
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
1415
|
+
|
1416
|
+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
|
1417
|
+
if (ec != GGML_STATUS_SUCCESS) {
|
1418
|
+
return ec;
|
1419
|
+
}
|
1420
|
+
|
1421
|
+
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
|
1422
|
+
ggml_backend_synchronize(split_backend);
|
1423
|
+
|
1424
|
+
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
1425
|
+
break;
|
1426
|
+
}
|
1427
|
+
|
1428
|
+
j0 = j1;
|
1429
|
+
}
|
1430
|
+
}
|
1431
|
+
|
1432
|
+
// record the event of this copy
|
1433
|
+
if (split->n_inputs > 0) {
|
1434
|
+
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
1435
|
+
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
|
1436
|
+
}
|
1437
|
+
}
|
1438
|
+
}
|
1439
|
+
|
1440
|
+
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
1441
|
+
|
1442
|
+
return GGML_STATUS_SUCCESS;
|
1443
|
+
}
|
1444
|
+
|
1445
|
+
ggml_backend_sched_t ggml_backend_sched_new(
|
1446
|
+
ggml_backend_t * backends,
|
1447
|
+
ggml_backend_buffer_type_t * bufts,
|
1448
|
+
int n_backends,
|
1449
|
+
size_t graph_size,
|
1450
|
+
bool parallel) {
|
1451
|
+
GGML_ASSERT(n_backends > 0);
|
1452
|
+
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
1453
|
+
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
1454
|
+
|
1455
|
+
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
1456
|
+
|
1457
|
+
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
|
1458
|
+
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
|
1459
|
+
sched->n_backends = n_backends;
|
1460
|
+
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
1461
|
+
|
1462
|
+
// initialize hash table
|
1463
|
+
// FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
|
1464
|
+
sched->hash_set = ggml_hash_set_new(graph_size);
|
1465
|
+
sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
1466
|
+
sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
1467
|
+
|
1468
|
+
const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
|
1469
|
+
const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
1470
|
+
sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
1471
|
+
sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
1472
|
+
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
1473
|
+
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
1474
|
+
|
1475
|
+
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
1476
|
+
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
|
1477
|
+
|
1478
|
+
const int initial_splits_capacity = 16;
|
1479
|
+
sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
1480
|
+
sched->splits_capacity = initial_splits_capacity;
|
1481
|
+
|
1482
|
+
for (int b = 0; b < n_backends; b++) {
|
1483
|
+
sched->backends[b] = backends[b];
|
1484
|
+
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
1485
|
+
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
1486
|
+
|
1487
|
+
if (sched->n_copies > 1) {
|
1488
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1489
|
+
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
|
1490
|
+
}
|
1491
|
+
}
|
1492
|
+
}
|
1493
|
+
|
1494
|
+
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
1495
|
+
|
1496
|
+
ggml_backend_sched_reset(sched);
|
1497
|
+
|
1498
|
+
return sched;
|
1499
|
+
}
|
1500
|
+
|
1501
|
+
void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
1502
|
+
if (sched == NULL) {
|
1503
|
+
return;
|
1504
|
+
}
|
1505
|
+
for (int b = 0; b < sched->n_backends; b++) {
|
1506
|
+
for (int c = 0; c < sched->n_copies; c++) {
|
1507
|
+
ggml_backend_event_free(sched->events[b][c]);
|
1508
|
+
}
|
1509
|
+
}
|
1510
|
+
ggml_gallocr_free(sched->galloc);
|
1511
|
+
ggml_free(sched->ctx);
|
1512
|
+
ggml_hash_set_free(&sched->hash_set);
|
1513
|
+
free(sched->splits);
|
1514
|
+
free(sched->hv_tensor_backend_ids);
|
1515
|
+
free(sched->hv_tensor_copies);
|
1516
|
+
free(sched->node_backend_ids);
|
1517
|
+
free(sched->leaf_backend_ids);
|
1518
|
+
free(sched->prev_node_backend_ids);
|
1519
|
+
free(sched->prev_leaf_backend_ids);
|
1520
|
+
free(sched->context_buffer);
|
1521
|
+
free(sched->graph.nodes);
|
1522
|
+
free(sched->graph.leafs);
|
1523
|
+
free(sched);
|
1524
|
+
}
|
1525
|
+
|
1526
|
+
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
1527
|
+
// reset state for the next run
|
1528
|
+
if (!sched->is_reset) {
|
1529
|
+
ggml_hash_set_reset(&sched->hash_set);
|
1530
|
+
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
1531
|
+
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
1532
|
+
sched->is_reset = true;
|
1533
|
+
}
|
1534
|
+
sched->is_alloc = false;
|
1535
|
+
}
|
1536
|
+
|
1537
|
+
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
1538
|
+
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
1539
|
+
|
1540
|
+
ggml_backend_sched_split_graph(sched, measure_graph);
|
1541
|
+
|
1542
|
+
ggml_backend_sched_synchronize(sched);
|
1543
|
+
|
1544
|
+
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
1545
|
+
return false;
|
1546
|
+
}
|
1547
|
+
|
1548
|
+
ggml_backend_sched_reset(sched);
|
1549
|
+
|
1550
|
+
return true;
|
1551
|
+
}
|
1552
|
+
|
1553
|
+
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
1554
|
+
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
1555
|
+
|
1556
|
+
ggml_backend_sched_split_graph(sched, graph);
|
1557
|
+
|
1558
|
+
|
1559
|
+
if (!ggml_backend_sched_alloc_splits(sched)) {
|
1560
|
+
return false;
|
1561
|
+
}
|
1562
|
+
|
1563
|
+
sched->is_alloc = true;
|
1564
|
+
|
1565
|
+
return true;
|
1566
|
+
}
|
1567
|
+
|
1568
|
+
enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
1569
|
+
enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
|
1570
|
+
ggml_backend_sched_synchronize(sched);
|
1571
|
+
return err;
|
1572
|
+
}
|
1573
|
+
|
1574
|
+
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
1575
|
+
if (!sched->is_reset && !sched->is_alloc) {
|
1576
|
+
ggml_backend_sched_reset(sched);
|
1577
|
+
}
|
1578
|
+
|
1579
|
+
if (!sched->is_alloc) {
|
1580
|
+
if (!ggml_backend_sched_alloc_graph(sched, graph)) {
|
1581
|
+
return GGML_STATUS_ALLOC_FAILED;
|
1582
|
+
}
|
1583
|
+
}
|
1584
|
+
|
1585
|
+
return ggml_backend_sched_compute_splits(sched);
|
1586
|
+
}
|
1587
|
+
|
1588
|
+
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
1589
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
1590
|
+
ggml_backend_synchronize(sched->backends[i]);
|
1591
|
+
}
|
1592
|
+
}
|
1593
|
+
|
1594
|
+
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
1595
|
+
sched->callback_eval = callback;
|
1596
|
+
sched->callback_eval_user_data = user_data;
|
1597
|
+
}
|
1598
|
+
|
1599
|
+
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
1600
|
+
return sched->n_splits;
|
1601
|
+
}
|
1602
|
+
|
1603
|
+
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
1604
|
+
return sched->n_copies;
|
1605
|
+
}
|
1606
|
+
|
1607
|
+
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
|
1608
|
+
return sched->n_backends;
|
1609
|
+
}
|
1610
|
+
|
1611
|
+
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
|
1612
|
+
GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
1613
|
+
return sched->backends[i];
|
1614
|
+
}
|
1615
|
+
|
1616
|
+
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
1617
|
+
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
1618
|
+
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
1619
|
+
|
1620
|
+
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
1621
|
+
}
|
1622
|
+
|
1623
|
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
1624
|
+
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
1625
|
+
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
1626
|
+
tensor_backend_id(node) = backend_index;
|
1627
|
+
SET_CAUSE(node, "usr");
|
1628
|
+
sched->is_reset = false;
|
1629
|
+
}
|
1630
|
+
|
1631
|
+
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
1632
|
+
int backend_index = tensor_backend_id(node);
|
1633
|
+
if (backend_index == -1) {
|
1634
|
+
return NULL;
|
1635
|
+
}
|
1636
|
+
return sched->backends[backend_index];
|
1637
|
+
}
|
1638
|
+
|
1639
|
+
// utils
|
1640
|
+
|
1641
|
+
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
1642
|
+
GGML_ASSERT(tensor->buffer == NULL);
|
1643
|
+
GGML_ASSERT(tensor->view_src != NULL);
|
1644
|
+
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
1645
|
+
GGML_ASSERT(tensor->view_src->data != NULL);
|
1646
|
+
|
1647
|
+
tensor->buffer = tensor->view_src->buffer;
|
1648
|
+
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
1649
|
+
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
1650
|
+
}
|
1651
|
+
|
1652
|
+
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
1653
|
+
GGML_ASSERT(tensor->buffer == NULL);
|
1654
|
+
GGML_ASSERT(tensor->data == NULL);
|
1655
|
+
GGML_ASSERT(tensor->view_src == NULL);
|
1656
|
+
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
|
1657
|
+
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
|
1658
|
+
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
|
1659
|
+
|
1660
|
+
tensor->buffer = buffer;
|
1661
|
+
tensor->data = addr;
|
1662
|
+
ggml_backend_buffer_init_tensor(buffer, tensor);
|
1663
|
+
}
|
1664
|
+
|
1665
|
+
static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
|
1666
|
+
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
|
1667
|
+
|
1668
|
+
GGML_ASSERT(src != NULL);
|
1669
|
+
GGML_ASSERT(src->data && "graph must be allocated");
|
1670
|
+
|
1671
|
+
size_t id = ggml_hash_insert(&hash_set, src);
|
1672
|
+
if (id == GGML_HASHSET_ALREADY_EXISTS) {
|
1673
|
+
return node_copies[ggml_hash_find(&hash_set, src)];
|
1674
|
+
}
|
1675
|
+
|
1676
|
+
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
|
1677
|
+
if (src->view_src != NULL) {
|
1678
|
+
dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
|
1679
|
+
dst->view_offs = src->view_offs;
|
1680
|
+
}
|
1681
|
+
dst->op = src->op;
|
1682
|
+
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
|
1683
|
+
ggml_set_name(dst, src->name);
|
1684
|
+
|
1685
|
+
// copy src
|
1686
|
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
1687
|
+
struct ggml_tensor * s = src->src[i];
|
1688
|
+
if (s == NULL) {
|
1689
|
+
continue;
|
1690
|
+
}
|
1691
|
+
dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
|
1692
|
+
}
|
1693
|
+
|
1694
|
+
node_copies[id] = dst;
|
1695
|
+
return dst;
|
1696
|
+
}
|
1697
|
+
|
1698
|
+
static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
|
1699
|
+
size_t id = ggml_hash_find(hash_set, src);
|
1700
|
+
if (node_init[id]) {
|
1701
|
+
return;
|
1702
|
+
}
|
1703
|
+
node_init[id] = true;
|
1704
|
+
|
1705
|
+
struct ggml_tensor * dst = node_copies[id];
|
1706
|
+
if (dst->view_src != NULL) {
|
1707
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
1708
|
+
ggml_backend_view_init(dst);
|
1709
|
+
}
|
1710
|
+
else {
|
1711
|
+
ggml_backend_tensor_copy(src, dst);
|
1712
|
+
}
|
1713
|
+
|
1714
|
+
// init src
|
1715
|
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
1716
|
+
struct ggml_tensor * s = src->src[i];
|
1717
|
+
if (s == NULL) {
|
1718
|
+
continue;
|
1719
|
+
}
|
1720
|
+
graph_copy_init_tensor(hash_set, node_copies, node_init, s);
|
1721
|
+
}
|
1722
|
+
}
|
1723
|
+
|
1724
|
+
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
1725
|
+
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
|
1726
|
+
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
1727
|
+
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
|
1728
|
+
|
1729
|
+
struct ggml_init_params params = {
|
1730
|
+
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
1731
|
+
/* .mem_buffer = */ NULL,
|
1732
|
+
/* .no_alloc = */ true
|
1733
|
+
};
|
1734
|
+
|
1735
|
+
struct ggml_context * ctx_allocated = ggml_init(params);
|
1736
|
+
struct ggml_context * ctx_unallocated = ggml_init(params);
|
1737
|
+
|
1738
|
+
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
1739
|
+
GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
|
1740
|
+
ggml_hash_set_free(&hash_set);
|
1741
|
+
free(node_copies);
|
1742
|
+
free(node_init);
|
1743
|
+
ggml_free(ctx_allocated);
|
1744
|
+
ggml_free(ctx_unallocated);
|
1745
|
+
return {
|
1746
|
+
/* .buffer = */ NULL,
|
1747
|
+
/* .ctx_allocated = */ NULL,
|
1748
|
+
/* .ctx_unallocated = */ NULL,
|
1749
|
+
/* .graph = */ NULL,
|
1750
|
+
};
|
1751
|
+
}
|
1752
|
+
|
1753
|
+
// dup nodes
|
1754
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1755
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1756
|
+
graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
|
1757
|
+
}
|
1758
|
+
|
1759
|
+
// allocate nodes
|
1760
|
+
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
1761
|
+
if (buffer == NULL) {
|
1762
|
+
GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
|
1763
|
+
ggml_hash_set_free(&hash_set);
|
1764
|
+
free(node_copies);
|
1765
|
+
free(node_init);
|
1766
|
+
ggml_free(ctx_allocated);
|
1767
|
+
ggml_free(ctx_unallocated);
|
1768
|
+
return {
|
1769
|
+
/* .buffer = */ NULL,
|
1770
|
+
/* .ctx_allocated = */ NULL,
|
1771
|
+
/* .ctx_unallocated = */ NULL,
|
1772
|
+
/* .graph = */ NULL,
|
1773
|
+
};
|
1774
|
+
}
|
1775
|
+
|
1776
|
+
//printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
|
1777
|
+
|
1778
|
+
// copy data and init views
|
1779
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1780
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1781
|
+
graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
|
1782
|
+
}
|
1783
|
+
|
1784
|
+
// build graph copy
|
1785
|
+
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
|
1786
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1787
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1788
|
+
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
|
1789
|
+
graph_copy->nodes[i] = node_copy;
|
1790
|
+
}
|
1791
|
+
graph_copy->n_nodes = graph->n_nodes;
|
1792
|
+
|
1793
|
+
ggml_hash_set_free(&hash_set);
|
1794
|
+
free(node_copies);
|
1795
|
+
free(node_init);
|
1796
|
+
|
1797
|
+
return {
|
1798
|
+
/* .buffer = */ buffer,
|
1799
|
+
/* .ctx_allocated = */ ctx_allocated,
|
1800
|
+
/* .ctx_unallocated = */ ctx_unallocated,
|
1801
|
+
/* .graph = */ graph_copy,
|
1802
|
+
};
|
1803
|
+
}
|
1804
|
+
|
1805
|
+
void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
|
1806
|
+
ggml_backend_buffer_free(copy.buffer);
|
1807
|
+
ggml_free(copy.ctx_allocated);
|
1808
|
+
ggml_free(copy.ctx_unallocated);
|
1809
|
+
}
|
1810
|
+
|
1811
|
+
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
|
1812
|
+
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
|
1813
|
+
if (copy.buffer == NULL) {
|
1814
|
+
return false;
|
1815
|
+
}
|
1816
|
+
|
1817
|
+
struct ggml_cgraph * g1 = graph;
|
1818
|
+
struct ggml_cgraph * g2 = copy.graph;
|
1819
|
+
|
1820
|
+
assert(g1->n_nodes == g2->n_nodes);
|
1821
|
+
|
1822
|
+
for (int i = 0; i < g1->n_nodes; i++) {
|
1823
|
+
//printf("eval %d/%d\n", i, g1->n_nodes);
|
1824
|
+
struct ggml_tensor * t1 = g1->nodes[i];
|
1825
|
+
struct ggml_tensor * t2 = g2->nodes[i];
|
1826
|
+
|
1827
|
+
assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
|
1828
|
+
|
1829
|
+
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
|
1830
|
+
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
|
1831
|
+
|
1832
|
+
ggml_backend_graph_compute(backend1, &g1v);
|
1833
|
+
ggml_backend_graph_compute(backend2, &g2v);
|
1834
|
+
|
1835
|
+
if (ggml_is_view_op(t1->op)) {
|
1836
|
+
continue;
|
1837
|
+
}
|
1838
|
+
|
1839
|
+
// compare results, calculate rms etc
|
1840
|
+
if (!callback(i, t1, t2, user_data)) {
|
1841
|
+
break;
|
1842
|
+
}
|
1843
|
+
}
|
1844
|
+
|
1845
|
+
ggml_backend_graph_copy_free(copy);
|
1846
|
+
|
1847
|
+
return true;
|
1848
|
+
}
|
1849
|
+
|
1850
|
+
// CPU backend - buffer
|
1851
|
+
|
1852
|
+
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
1853
|
+
uintptr_t data = (uintptr_t)buffer->context;
|
1854
|
+
|
1855
|
+
// align the buffer
|
1856
|
+
if (data % TENSOR_ALIGNMENT != 0) {
|
1857
|
+
data = GGML_PAD(data, TENSOR_ALIGNMENT);
|
1858
|
+
}
|
1859
|
+
|
1860
|
+
return (void *)data;
|
1861
|
+
}
|
1862
|
+
|
1863
|
+
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
1864
|
+
ggml_aligned_free(buffer->context, buffer->size);
|
1865
|
+
}
|
1866
|
+
|
1867
|
+
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
1868
|
+
memset((char *)tensor->data + offset, value, size);
|
1869
|
+
|
1870
|
+
GGML_UNUSED(buffer);
|
1871
|
+
}
|
1872
|
+
|
1873
|
+
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
1874
|
+
memcpy((char *)tensor->data + offset, data, size);
|
1875
|
+
|
1876
|
+
GGML_UNUSED(buffer);
|
1877
|
+
}
|
1878
|
+
|
1879
|
+
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
1880
|
+
memcpy(data, (const char *)tensor->data + offset, size);
|
1881
|
+
|
1882
|
+
GGML_UNUSED(buffer);
|
1883
|
+
}
|
1884
|
+
|
1885
|
+
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
1886
|
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
1887
|
+
memcpy(dst->data, src->data, ggml_nbytes(src));
|
1888
|
+
return true;
|
1889
|
+
}
|
1890
|
+
return false;
|
1891
|
+
|
1892
|
+
GGML_UNUSED(buffer);
|
1893
|
+
}
|
1894
|
+
|
1895
|
+
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
1896
|
+
memset(buffer->context, value, buffer->size);
|
1897
|
+
}
|
1898
|
+
|
1899
|
+
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
1900
|
+
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
1901
|
+
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
1902
|
+
/* .init_tensor = */ NULL, // no initialization required
|
1903
|
+
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
1904
|
+
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
1905
|
+
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
1906
|
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
1907
|
+
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
1908
|
+
/* .reset = */ NULL,
|
1909
|
+
};
|
1910
|
+
|
1911
|
+
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
1912
|
+
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
1913
|
+
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
1914
|
+
/* .init_tensor = */ NULL, // no initialization required
|
1915
|
+
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
1916
|
+
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
1917
|
+
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
1918
|
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
1919
|
+
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
1920
|
+
/* .reset = */ NULL,
|
1921
|
+
};
|
1922
|
+
|
1923
|
+
// CPU backend buffer type
|
1924
|
+
|
1925
|
+
// this buffer type is defined here to make it available to all backends
|
1926
|
+
|
1927
|
+
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
1928
|
+
return "CPU";
|
1929
|
+
|
1930
|
+
GGML_UNUSED(buft);
|
1931
|
+
}
|
1932
|
+
|
1933
|
+
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
1934
|
+
void * data = ggml_aligned_malloc(size);
|
1935
|
+
|
1936
|
+
if (data == NULL) {
|
1937
|
+
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
|
1938
|
+
return NULL;
|
1939
|
+
}
|
1940
|
+
|
1941
|
+
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
1942
|
+
}
|
1943
|
+
|
1944
|
+
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
1945
|
+
return TENSOR_ALIGNMENT;
|
1946
|
+
|
1947
|
+
GGML_UNUSED(buft);
|
1948
|
+
}
|
1949
|
+
|
1950
|
+
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
1951
|
+
return true;
|
1952
|
+
|
1953
|
+
GGML_UNUSED(buft);
|
1954
|
+
}
|
1955
|
+
|
1956
|
+
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
1957
|
+
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
1958
|
+
/* .iface = */ {
|
1959
|
+
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
1960
|
+
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
1961
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
1962
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
1963
|
+
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
1964
|
+
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
1965
|
+
},
|
1966
|
+
/* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
1967
|
+
/* .context = */ NULL,
|
1968
|
+
};
|
1969
|
+
|
1970
|
+
return &ggml_backend_cpu_buffer_type;
|
1971
|
+
}
|
1972
|
+
|
1973
|
+
static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
|
1974
|
+
return "CPU_Mapped";
|
1975
|
+
|
1976
|
+
GGML_UNUSED(buft);
|
1977
|
+
}
|
1978
|
+
|
1979
|
+
static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
|
1980
|
+
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
1981
|
+
/* .iface = */ {
|
1982
|
+
/* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
|
1983
|
+
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
1984
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
1985
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
1986
|
+
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
1987
|
+
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
1988
|
+
},
|
1989
|
+
/* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
1990
|
+
/* .context = */ NULL,
|
1991
|
+
};
|
1992
|
+
|
1993
|
+
return &ggml_backend_cpu_buffer_type;
|
1994
|
+
}
|
1995
|
+
|
1996
|
+
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
1997
|
+
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
1998
|
+
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
1999
|
+
}
|