llama_cpp 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +78 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +11 -0
- data/vendor/tmp/llama.cpp/Makefile +7 -10
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +512 -261
- data/vendor/tmp/llama.cpp/ggml-backend.h +43 -33
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1494 -559
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1868 -2002
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +2182 -44
- data/vendor/tmp/llama.cpp/ggml-quants.h +36 -1
- data/vendor/tmp/llama.cpp/ggml.c +222 -105
- data/vendor/tmp/llama.cpp/ggml.h +56 -35
- data/vendor/tmp/llama.cpp/llama.cpp +1271 -1618
- data/vendor/tmp/llama.cpp/llama.h +44 -8
- metadata +2 -2
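
Two API changes stand out in the ggml-backend.c diff below: ggml_backend_graph_compute() now returns a bool, and ggml_backend_sched_new() now takes per-backend buffer types plus an expected graph size. The following is a minimal sketch of how a caller might adapt, assuming a graph `gf` that was built and allocated in backend buffers elsewhere; the function and variable names here are illustrative and not part of the package.

```c
#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical caller-side code, not from the gem: exercises the two API
// changes visible in the ggml-backend.c hunks below.
void compute_example(struct ggml_cgraph * gf) {
    ggml_backend_t cpu = ggml_backend_cpu_init();

    // ggml_backend_graph_compute() now reports success instead of returning void.
    if (!ggml_backend_graph_compute(cpu, gf)) {
        fprintf(stderr, "graph compute failed\n");
    }

    // ggml_backend_sched_new() now takes per-backend buffer types (NULL = use
    // each backend's default buffer type) and a graph size used to size the
    // scheduler's hash tables up front.
    ggml_backend_t backends[1] = { cpu };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 1, GGML_DEFAULT_GRAPH_SIZE);
    ggml_backend_sched_free(sched);

    ggml_backend_free(cpu);
}
```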
data/vendor/tmp/llama.cpp/ggml-backend.c:

@@ -15,7 +15,11 @@

 // backend buffer type

-ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name(buft);
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     return buft->iface.alloc_buffer(buft, size);
 }

@@ -23,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }

-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         return buft->iface.get_alloc_size(buft, tensor);
@@ -44,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {

 // backend buffer

-ggml_backend_buffer_t ggml_backend_buffer_init(
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
         ggml_backend_buffer_type_t buft,
         struct ggml_backend_buffer_i iface,
         ggml_backend_buffer_context_t context,
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
         /* .buft      = */ buft,
         /* .context   = */ context,
         /* .size      = */ size,
+        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
     };

     return buffer;
 }

+const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name(buffer);
+}
+
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
         return;
@@ -86,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
     return base;
 }

-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
 }

 size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_get_alignment(
+    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }

 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    return ggml_backend_buft_get_alloc_size(
+    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }

 void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 }

 bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_is_host(
+    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
 }

-
+void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    buffer->usage = usage;
+}
+
+ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }

+void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
+    if (buffer->iface.reset) {
+        buffer->iface.reset(buffer);
+    }
+}
+
+bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
+    if (dst_buf->iface.cpy_tensor) {
+        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+    }
+    return false;
+}
+
 // backend

 const char * ggml_backend_name(ggml_backend_t backend) {
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

-    backend->iface.set_tensor_async
+    if (backend->iface.set_tensor_async == NULL) {
+        ggml_backend_tensor_set(tensor, data, offset, size);
+    } else {
+        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    }
 }

 void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

-    backend->iface.get_tensor_async
+    if (backend->iface.get_tensor_async == NULL) {
+        ggml_backend_tensor_get(tensor, data, offset, size);
+    } else {
+        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    }
 }

-void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

-    tensor->buffer->iface.set_tensor(
+    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }

-void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

-    tensor->buffer->iface.get_tensor(
+    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
 }

 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -190,16 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla

 void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     backend->iface.graph_plan_compute(backend, plan);
-
-    // TODO: optional sync
-    ggml_backend_synchronize(backend);
 }

-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
-
-    // TODO: optional sync
-    ggml_backend_synchronize(backend);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_compute(backend, cgraph);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -224,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
 }

 void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
-    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
-    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

-    // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
-
     if (src == dst) {
         return;
     }

-
-
-    if (dst->buffer
-
-    } else if (src
-
-
-
-#ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
-                        "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
-#endif
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
+    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
+        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+#endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
         ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -254,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }

+void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    if (src == dst) {
+        return;
+    }
+
+    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
+        if (backend->iface.cpy_tensor_async != NULL) {
+            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
+                return;
+            }
+        }
+    }
+
+    size_t nbytes = ggml_nbytes(src);
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
+    }
+    else {
+        ggml_backend_tensor_copy(src, dst);
+    }
+}
+
+
 // backend registry

 #define GGML_MAX_BACKENDS_REG 16
@@ -268,9 +318,9 @@ struct ggml_backend_reg {
|
|
|
268
318
|
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
|
|
269
319
|
static size_t ggml_backend_registry_count = 0;
|
|
270
320
|
|
|
271
|
-
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
|
321
|
+
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
|
272
322
|
|
|
273
|
-
static void ggml_backend_registry_init(void) {
|
|
323
|
+
GGML_CALL static void ggml_backend_registry_init(void) {
|
|
274
324
|
static bool initialized = false;
|
|
275
325
|
|
|
276
326
|
if (initialized) {
|
|
@@ -283,18 +333,18 @@ static void ggml_backend_registry_init(void) {
|
|
|
283
333
|
|
|
284
334
|
// add forward decls here to avoid including the backend headers
|
|
285
335
|
#ifdef GGML_USE_CUBLAS
|
|
286
|
-
extern void ggml_backend_cuda_reg_devices(void);
|
|
336
|
+
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
|
287
337
|
ggml_backend_cuda_reg_devices();
|
|
288
338
|
#endif
|
|
289
339
|
|
|
290
340
|
#ifdef GGML_USE_METAL
|
|
291
|
-
extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
|
292
|
-
extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
|
341
|
+
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
|
342
|
+
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
|
293
343
|
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
|
294
344
|
#endif
|
|
295
345
|
}
|
|
296
346
|
|
|
297
|
-
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
|
347
|
+
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
|
298
348
|
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
|
299
349
|
|
|
300
350
|
size_t id = ggml_backend_registry_count;
|
|
@@ -389,68 +439,80 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
|
|
389
439
|
|
|
390
440
|
// backend CPU
|
|
391
441
|
|
|
392
|
-
static
|
|
442
|
+
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
|
443
|
+
return "CPU";
|
|
444
|
+
|
|
445
|
+
GGML_UNUSED(buffer);
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
393
449
|
return (void *)buffer->context;
|
|
394
450
|
}
|
|
395
451
|
|
|
396
|
-
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
452
|
+
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
397
453
|
free(buffer->context);
|
|
398
454
|
}
|
|
399
455
|
|
|
400
|
-
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
456
|
+
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
401
457
|
memcpy((char *)tensor->data + offset, data, size);
|
|
402
458
|
|
|
403
459
|
GGML_UNUSED(buffer);
|
|
404
460
|
}
|
|
405
461
|
|
|
406
|
-
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
462
|
+
GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
407
463
|
memcpy(data, (const char *)tensor->data + offset, size);
|
|
408
464
|
|
|
409
465
|
GGML_UNUSED(buffer);
|
|
410
466
|
}
|
|
411
467
|
|
|
412
|
-
static
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
}
|
|
417
|
-
|
|
418
|
-
static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
419
|
-
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
|
468
|
+
GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
469
|
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
|
470
|
+
memcpy(dst->data, src->data, ggml_nbytes(src));
|
|
471
|
+
return true;
|
|
472
|
+
}
|
|
473
|
+
return false;
|
|
420
474
|
|
|
421
475
|
GGML_UNUSED(buffer);
|
|
422
476
|
}
|
|
423
477
|
|
|
424
|
-
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
478
|
+
GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
425
479
|
memset(buffer->context, value, buffer->size);
|
|
426
480
|
}
|
|
427
481
|
|
|
428
482
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|
483
|
+
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
|
429
484
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
|
430
485
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
431
486
|
/* .init_tensor = */ NULL, // no initialization required
|
|
432
487
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
433
488
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
434
|
-
/* .
|
|
435
|
-
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
|
489
|
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
436
490
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
491
|
+
/* .reset = */ NULL,
|
|
437
492
|
};
|
|
438
493
|
|
|
439
494
|
// for buffers from ptr, free is not called
|
|
440
495
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|
496
|
+
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
|
441
497
|
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
|
442
498
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
443
499
|
/* .init_tensor = */ NULL, // no initialization required
|
|
444
500
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
445
501
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
446
|
-
/* .
|
|
447
|
-
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
|
502
|
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
448
503
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
504
|
+
/* .reset = */ NULL,
|
|
449
505
|
};
|
|
450
506
|
|
|
451
507
|
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
|
452
508
|
|
|
453
|
-
static
|
|
509
|
+
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
510
|
+
return "CPU";
|
|
511
|
+
|
|
512
|
+
GGML_UNUSED(buft);
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
454
516
|
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
|
455
517
|
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
|
456
518
|
|
|
@@ -459,27 +521,28 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
|
|
|
459
521
|
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
|
460
522
|
}
|
|
461
523
|
|
|
462
|
-
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
524
|
+
GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
463
525
|
return TENSOR_ALIGNMENT;
|
|
464
526
|
|
|
465
527
|
GGML_UNUSED(buft);
|
|
466
528
|
}
|
|
467
529
|
|
|
468
|
-
static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
530
|
+
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
469
531
|
return ggml_backend_is_cpu(backend);
|
|
470
532
|
|
|
471
533
|
GGML_UNUSED(buft);
|
|
472
534
|
}
|
|
473
535
|
|
|
474
|
-
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
536
|
+
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
475
537
|
return true;
|
|
476
538
|
|
|
477
539
|
GGML_UNUSED(buft);
|
|
478
540
|
}
|
|
479
541
|
|
|
480
|
-
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
542
|
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
481
543
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
|
482
544
|
/* .iface = */ {
|
|
545
|
+
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
|
483
546
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
|
484
547
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
485
548
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
@@ -498,11 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
|
498
561
|
|
|
499
562
|
#include <hbwmalloc.h>
|
|
500
563
|
|
|
501
|
-
static
|
|
564
|
+
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
565
|
+
return "CPU_HBM";
|
|
566
|
+
|
|
567
|
+
GGML_UNUSED(buft);
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
|
571
|
+
return "CPU_HBM";
|
|
572
|
+
|
|
573
|
+
GGML_UNUSED(buf);
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
502
577
|
hbw_free(buffer->context);
|
|
503
578
|
}
|
|
504
579
|
|
|
505
|
-
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
580
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
506
581
|
//void * ptr = hbw_malloc(size);
|
|
507
582
|
void * ptr;
|
|
508
583
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
|
@@ -511,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
|
|
511
586
|
return NULL;
|
|
512
587
|
}
|
|
513
588
|
|
|
514
|
-
// FIXME: this is a hack to avoid having to implement a new buffer type
|
|
515
589
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
|
516
590
|
buffer->buft = buft;
|
|
591
|
+
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
|
517
592
|
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
|
518
593
|
|
|
519
594
|
return buffer;
|
|
520
595
|
}
|
|
521
596
|
|
|
522
|
-
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
|
597
|
+
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
523
598
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
|
524
599
|
/* .iface = */ {
|
|
600
|
+
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
|
525
601
|
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
|
526
602
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
527
603
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
@@ -541,20 +617,20 @@ struct ggml_backend_cpu_context {
|
|
|
541
617
|
size_t work_size;
|
|
542
618
|
};
|
|
543
619
|
|
|
544
|
-
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
|
620
|
+
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
|
545
621
|
return "CPU";
|
|
546
622
|
|
|
547
623
|
GGML_UNUSED(backend);
|
|
548
624
|
}
|
|
549
625
|
|
|
550
|
-
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
|
626
|
+
GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
|
551
627
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
552
628
|
free(cpu_ctx->work_data);
|
|
553
629
|
free(cpu_ctx);
|
|
554
630
|
free(backend);
|
|
555
631
|
}
|
|
556
632
|
|
|
557
|
-
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
|
633
|
+
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
|
558
634
|
return ggml_backend_cpu_buffer_type();
|
|
559
635
|
|
|
560
636
|
GGML_UNUSED(backend);
|
|
@@ -565,7 +641,7 @@ struct ggml_backend_plan_cpu {
|
|
|
565
641
|
struct ggml_cgraph cgraph;
|
|
566
642
|
};
|
|
567
643
|
|
|
568
|
-
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
644
|
+
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
|
569
645
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
570
646
|
|
|
571
647
|
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
|
@@ -580,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
|
|
|
580
656
|
return cpu_plan;
|
|
581
657
|
}
|
|
582
658
|
|
|
583
|
-
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
659
|
+
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
584
660
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
|
585
661
|
|
|
586
662
|
free(cpu_plan->cplan.work_data);
|
|
@@ -589,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
|
|
|
589
665
|
GGML_UNUSED(backend);
|
|
590
666
|
}
|
|
591
667
|
|
|
592
|
-
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
668
|
+
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
593
669
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
|
594
670
|
|
|
595
671
|
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
|
@@ -597,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
|
|
|
597
673
|
GGML_UNUSED(backend);
|
|
598
674
|
}
|
|
599
675
|
|
|
600
|
-
static
|
|
676
|
+
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
601
677
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
602
678
|
|
|
603
679
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
|
@@ -611,9 +687,10 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
|
|
|
611
687
|
cplan.work_data = cpu_ctx->work_data;
|
|
612
688
|
|
|
613
689
|
ggml_graph_compute(cgraph, &cplan);
|
|
690
|
+
return true;
|
|
614
691
|
}
|
|
615
692
|
|
|
616
|
-
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
693
|
+
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
617
694
|
switch (op->op) {
|
|
618
695
|
case GGML_OP_MUL_MAT:
|
|
619
696
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
|
@@ -630,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
|
630
707
|
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
|
631
708
|
/* .set_tensor_async = */ NULL,
|
|
632
709
|
/* .get_tensor_async = */ NULL,
|
|
633
|
-
/* .
|
|
634
|
-
/* .cpy_tensor_to_async = */ NULL,
|
|
710
|
+
/* .cpy_tensor_async = */ NULL,
|
|
635
711
|
/* .synchronize = */ NULL,
|
|
636
712
|
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
|
637
713
|
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
|
@@ -656,8 +732,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
|
656
732
|
return cpu_backend;
|
|
657
733
|
}
|
|
658
734
|
|
|
659
|
-
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
|
660
|
-
return backend->iface.get_name == ggml_backend_cpu_name;
|
|
735
|
+
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
|
736
|
+
return backend && backend->iface.get_name == ggml_backend_cpu_name;
|
|
661
737
|
}
|
|
662
738
|
|
|
663
739
|
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
@@ -667,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
|
667
743
|
ctx->n_threads = n_threads;
|
|
668
744
|
}
|
|
669
745
|
|
|
670
|
-
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
|
746
|
+
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
|
671
747
|
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
|
672
748
|
}
|
|
673
749
|
|
|
674
|
-
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
|
750
|
+
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
|
675
751
|
return ggml_backend_cpu_init();
|
|
676
752
|
|
|
677
753
|
GGML_UNUSED(params);
|
|
@@ -681,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
|
|
|
681
757
|
|
|
682
758
|
// scheduler
|
|
683
759
|
|
|
684
|
-
#define GGML_MAX_BACKENDS
|
|
760
|
+
#define GGML_MAX_BACKENDS 16
|
|
685
761
|
#define GGML_MAX_SPLITS 256
|
|
686
762
|
#define GGML_MAX_SPLIT_INPUTS 16
|
|
687
763
|
|
|
@@ -691,21 +767,29 @@ struct ggml_backend_sched_split {
|
|
|
691
767
|
int i_end;
|
|
692
768
|
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
|
|
693
769
|
int n_inputs;
|
|
770
|
+
// graph view of this split
|
|
694
771
|
struct ggml_cgraph graph;
|
|
695
772
|
};
|
|
696
773
|
|
|
697
774
|
struct ggml_backend_sched {
|
|
775
|
+
bool is_reset; // true if the scheduler has been reset since the last graph split
|
|
776
|
+
|
|
698
777
|
int n_backends;
|
|
699
778
|
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
|
779
|
+
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
|
|
700
780
|
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
|
|
701
781
|
|
|
702
782
|
ggml_gallocr_t galloc;
|
|
703
783
|
|
|
784
|
+
// hash keys of the nodes in the graph
|
|
704
785
|
struct ggml_hash_set hash_set;
|
|
705
|
-
|
|
706
|
-
|
|
786
|
+
// hash values (arrays of [hash_set.size])
|
|
787
|
+
ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
|
|
788
|
+
struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
|
|
707
789
|
|
|
790
|
+
// copy of the graph with modified inputs
|
|
708
791
|
struct ggml_cgraph * graph;
|
|
792
|
+
|
|
709
793
|
struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
|
|
710
794
|
int n_splits;
|
|
711
795
|
|
|
@@ -746,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
|
|
|
746
830
|
return INT_MAX;
|
|
747
831
|
}
|
|
748
832
|
|
|
749
|
-
static
|
|
833
|
+
static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
|
|
750
834
|
if (buffer == NULL) {
|
|
751
835
|
return NULL;
|
|
752
836
|
}
|
|
837
|
+
|
|
838
|
+
// check if this is already allocate in a allocr buffer (from user manual allocations)
|
|
839
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
|
840
|
+
if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
|
|
841
|
+
return sched->tallocs[i];
|
|
842
|
+
}
|
|
843
|
+
}
|
|
844
|
+
|
|
753
845
|
// find highest prio backend that supports the buffer type
|
|
754
846
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
755
847
|
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
|
756
|
-
return sched->
|
|
848
|
+
return sched->tallocs[i];
|
|
757
849
|
}
|
|
758
850
|
}
|
|
759
851
|
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
|
@@ -763,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
|
|
|
763
855
|
if (allocr == NULL) {
|
|
764
856
|
return NULL;
|
|
765
857
|
}
|
|
766
|
-
// find highest prio backend that supports the buffer type
|
|
767
858
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
768
859
|
if (sched->tallocs[i] == allocr) {
|
|
769
860
|
return sched->backends[i];
|
|
@@ -773,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
|
|
|
773
864
|
}
|
|
774
865
|
|
|
775
866
|
#if 0
|
|
776
|
-
static char causes[GGML_DEFAULT_GRAPH_SIZE*
|
|
867
|
+
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
|
|
777
868
|
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
|
778
869
|
#define GET_CAUSE(node) causes[hash_id(node)]
|
|
779
870
|
#else
|
|
@@ -782,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
|
|
|
782
873
|
#endif
|
|
783
874
|
|
|
784
875
|
// returns the backend that should be used for the node based on the current locations
|
|
785
|
-
static
|
|
786
|
-
//
|
|
787
|
-
// ie. kv cache updates
|
|
788
|
-
// note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
|
|
876
|
+
static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
|
877
|
+
// assign pre-allocated nodes to their backend
|
|
789
878
|
// dst
|
|
790
|
-
|
|
791
|
-
if (
|
|
879
|
+
ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
|
|
880
|
+
if (cur_allocr != NULL) {
|
|
792
881
|
SET_CAUSE(node, "1.dst");
|
|
793
|
-
return
|
|
882
|
+
return cur_allocr;
|
|
794
883
|
}
|
|
795
|
-
|
|
796
884
|
// view_src
|
|
797
|
-
if (node->view_src != NULL
|
|
798
|
-
|
|
799
|
-
|
|
885
|
+
if (node->view_src != NULL) {
|
|
886
|
+
cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
|
|
887
|
+
if (cur_allocr != NULL) {
|
|
888
|
+
SET_CAUSE(node, "1.vsrc");
|
|
889
|
+
return cur_allocr;
|
|
890
|
+
}
|
|
800
891
|
}
|
|
801
|
-
|
|
802
|
-
// src
|
|
803
|
-
int cur_prio = INT_MAX;
|
|
804
|
-
size_t cur_size = 0;
|
|
805
|
-
|
|
892
|
+
// assign nodes that use weights to the backend of the weights
|
|
806
893
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
807
894
|
const struct ggml_tensor * src = node->src[i];
|
|
808
895
|
if (src == NULL) {
|
|
809
896
|
break;
|
|
810
897
|
}
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
cur_prio = src_prio;
|
|
817
|
-
cur_size = src_size;
|
|
818
|
-
cur_backend = src_backend;
|
|
819
|
-
SET_CAUSE(node, "1.src%d", i);
|
|
820
|
-
}
|
|
898
|
+
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
|
899
|
+
ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
|
|
900
|
+
// operations with weights are always run on the same backend as the weights
|
|
901
|
+
SET_CAUSE(node, "1.wgt%d", i);
|
|
902
|
+
return src_allocr;
|
|
821
903
|
}
|
|
822
904
|
}
|
|
823
|
-
|
|
905
|
+
|
|
906
|
+
return NULL;
|
|
824
907
|
}
|
|
825
908
|
|
|
826
909
|
static char * fmt_size(size_t size) {
|
|
@@ -853,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
|
853
936
|
}
|
|
854
937
|
ggml_tallocr_t node_allocr = node_allocr(node);
|
|
855
938
|
ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
|
|
856
|
-
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%
|
|
939
|
+
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
|
857
940
|
fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
|
|
858
941
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
859
942
|
struct ggml_tensor * src = node->src[j];
|
|
@@ -862,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
|
862
945
|
}
|
|
863
946
|
ggml_tallocr_t src_allocr = node_allocr(src);
|
|
864
947
|
ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
|
|
865
|
-
fprintf(stderr, " %20.20s (%
|
|
948
|
+
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
|
866
949
|
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
|
867
950
|
}
|
|
868
951
|
fprintf(stderr, "\n");
|
|
@@ -878,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
|
|
|
878
961
|
return dup;
|
|
879
962
|
}
|
|
880
963
|
|
|
964
|
+
|
|
965
|
+
//#define DEBUG_PASS1
|
|
966
|
+
//#define DEBUG_PASS2
|
|
967
|
+
//#define DEBUG_PASS3
|
|
968
|
+
//#define DEBUG_PASS4
|
|
969
|
+
|
|
881
970
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
|
882
|
-
// TODO: merge passes
|
|
883
971
|
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
884
|
-
// reset
|
|
885
|
-
size_t hash_size = sched->hash_set.size;
|
|
886
|
-
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
|
887
|
-
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
|
888
|
-
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
|
972
|
+
// reset splits
|
|
889
973
|
sched->n_splits = 0;
|
|
974
|
+
sched->is_reset = false;
|
|
890
975
|
|
|
891
976
|
struct ggml_init_params params = {
|
|
892
977
|
/* .mem_size = */ sizeof(sched->context_buffer),
|
|
@@ -894,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
|
894
979
|
/* .no_alloc = */ true
|
|
895
980
|
};
|
|
896
981
|
|
|
897
|
-
|
|
898
|
-
ggml_free(sched->ctx);
|
|
899
|
-
}
|
|
982
|
+
ggml_free(sched->ctx);
|
|
900
983
|
|
|
901
984
|
sched->ctx = ggml_init(params);
|
|
985
|
+
if (sched->ctx == NULL) {
|
|
986
|
+
fprintf(stderr, "%s: failed to initialize context\n", __func__);
|
|
987
|
+
GGML_ASSERT(false);
|
|
988
|
+
}
|
|
902
989
|
|
|
903
|
-
// pass 1: assign backends to ops with allocated inputs
|
|
990
|
+
// pass 1: assign backends to ops with pre-allocated inputs
|
|
904
991
|
for (int i = 0; i < graph->n_leafs; i++) {
|
|
905
992
|
struct ggml_tensor * leaf = graph->leafs[i];
|
|
906
993
|
if (node_allocr(leaf) != NULL) {
|
|
907
994
|
// do not overwrite user assignments
|
|
908
995
|
continue;
|
|
909
996
|
}
|
|
910
|
-
|
|
911
|
-
if (leaf_backend == NULL && leaf->view_src != NULL) {
|
|
912
|
-
leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
|
|
913
|
-
}
|
|
914
|
-
if (leaf_backend != NULL) {
|
|
915
|
-
node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
|
|
916
|
-
}
|
|
997
|
+
node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
|
|
917
998
|
}
|
|
918
999
|
|
|
919
1000
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -922,50 +1003,120 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
|
922
1003
|
// do not overwrite user assignments
|
|
923
1004
|
continue;
|
|
924
1005
|
}
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
1006
|
+
node_allocr(node) = sched_allocr_from_cur(sched, node);
|
|
1007
|
+
// src
|
|
1008
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1009
|
+
struct ggml_tensor * src = node->src[j];
|
|
1010
|
+
if (src == NULL) {
|
|
1011
|
+
break;
|
|
1012
|
+
}
|
|
1013
|
+
if (node_allocr(src) == NULL) {
|
|
1014
|
+
node_allocr(src) = sched_allocr_from_cur(sched, src);
|
|
1015
|
+
}
|
|
928
1016
|
}
|
|
929
1017
|
}
|
|
930
|
-
|
|
1018
|
+
#ifdef DEBUG_PASS1
|
|
1019
|
+
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
|
1020
|
+
#endif
|
|
931
1021
|
|
|
932
|
-
// pass 2:
|
|
933
|
-
//
|
|
934
|
-
//
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
1022
|
+
// pass 2: expand current backend assignments
|
|
1023
|
+
// assign the same backend to adjacent nodes
|
|
1024
|
+
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
|
1025
|
+
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
|
1026
|
+
|
|
1027
|
+
// pass 2.1 expand gpu up
|
|
1028
|
+
{
|
|
1029
|
+
ggml_tallocr_t cur_allocr = NULL;
|
|
1030
|
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
1031
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1032
|
+
if (ggml_is_view_op(node->op)) {
|
|
1033
|
+
continue;
|
|
1034
|
+
}
|
|
1035
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
|
1036
|
+
if (node_allocr != NULL) {
|
|
1037
|
+
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
|
1038
|
+
// skip cpu (lowest prio backend)
|
|
1039
|
+
cur_allocr = NULL;
|
|
1040
|
+
} else {
|
|
1041
|
+
cur_allocr = node_allocr;
|
|
945
1042
|
}
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
1043
|
+
} else {
|
|
1044
|
+
node_allocr(node) = cur_allocr;
|
|
1045
|
+
SET_CAUSE(node, "2.1");
|
|
1046
|
+
}
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
// pass 2.2 expand gpu down
|
|
1051
|
+
{
|
|
1052
|
+
ggml_tallocr_t cur_allocr = NULL;
|
|
1053
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1054
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1055
|
+
if (ggml_is_view_op(node->op)) {
|
|
1056
|
+
continue;
|
|
1057
|
+
}
|
|
1058
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
|
1059
|
+
if (node_allocr != NULL) {
|
|
1060
|
+
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
|
1061
|
+
// skip cpu (lowest prio backend)
|
|
1062
|
+
cur_allocr = NULL;
|
|
1063
|
+
} else {
|
|
1064
|
+
cur_allocr = node_allocr;
|
|
956
1065
|
}
|
|
1066
|
+
} else {
|
|
1067
|
+
node_allocr(node) = cur_allocr;
|
|
1068
|
+
SET_CAUSE(node, "2.2");
|
|
957
1069
|
}
|
|
1070
|
+
}
|
|
1071
|
+
}
|
|
1072
|
+
|
|
1073
|
+
// pass 2.3 expand rest up
|
|
1074
|
+
{
|
|
1075
|
+
ggml_tallocr_t cur_allocr = NULL;
|
|
1076
|
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
1077
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1078
|
+
if (ggml_is_view_op(node->op)) {
|
|
1079
|
+
continue;
|
|
1080
|
+
}
|
|
1081
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
|
958
1082
|
if (node_allocr != NULL) {
|
|
959
|
-
|
|
1083
|
+
cur_allocr = node_allocr;
|
|
1084
|
+
} else {
|
|
1085
|
+
node_allocr(node) = cur_allocr;
|
|
1086
|
+
SET_CAUSE(node, "2.3");
|
|
960
1087
|
}
|
|
961
1088
|
}
|
|
962
1089
|
}
|
|
963
|
-
//printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
|
964
1090
|
|
|
965
|
-
// pass
|
|
1091
|
+
// pass 2.4 expand rest down
|
|
1092
|
+
{
|
|
1093
|
+
ggml_tallocr_t cur_allocr = NULL;
|
|
1094
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1095
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1096
|
+
if (ggml_is_view_op(node->op)) {
|
|
1097
|
+
continue;
|
|
1098
|
+
}
|
|
1099
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
|
1100
|
+
if (node_allocr != NULL) {
|
|
1101
|
+
cur_allocr = node_allocr;
|
|
1102
|
+
} else {
|
|
1103
|
+
node_allocr(node) = cur_allocr;
|
|
1104
|
+
SET_CAUSE(node, "2.4");
|
|
1105
|
+
}
|
|
1106
|
+
}
|
|
1107
|
+
}
|
|
1108
|
+
#ifdef DEBUG_PASS2
|
|
1109
|
+
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
|
1110
|
+
#endif
|
|
1111
|
+
|
|
1112
|
+
// pass 3: assign backends to remaining src from dst and view_src
|
|
966
1113
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
967
1114
|
struct ggml_tensor * node = graph->nodes[i];
|
|
968
|
-
ggml_tallocr_t
|
|
1115
|
+
ggml_tallocr_t cur_allocr = node_allocr(node);
|
|
1116
|
+
if (node->view_src != NULL && cur_allocr == NULL) {
|
|
1117
|
+
cur_allocr = node_allocr(node) = node_allocr(node->view_src);
|
|
1118
|
+
SET_CAUSE(node, "3.vsrc");
|
|
1119
|
+
}
|
|
969
1120
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
970
1121
|
struct ggml_tensor * src = node->src[j];
|
|
971
1122
|
if (src == NULL) {
|
|
@@ -973,81 +1124,107 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
|
973
1124
|
}
|
|
974
1125
|
ggml_tallocr_t src_allocr = node_allocr(src);
|
|
975
1126
|
if (src_allocr == NULL) {
|
|
976
|
-
|
|
1127
|
+
if (src->view_src != NULL) {
|
|
1128
|
+
// views are always on the same backend as the source
|
|
1129
|
+
node_allocr(src) = node_allocr(src->view_src);
|
|
1130
|
+
SET_CAUSE(src, "3.vsrc");
|
|
1131
|
+
} else {
|
|
1132
|
+
node_allocr(src) = cur_allocr;
|
|
1133
|
+
SET_CAUSE(src, "3.cur");
|
|
1134
|
+
}
|
|
977
1135
|
}
|
|
978
1136
|
}
|
|
979
1137
|
}
|
|
980
|
-
|
|
1138
|
+
#ifdef DEBUG_PASS3
|
|
1139
|
+
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
|
1140
|
+
#endif
|
|
981
1141
|
|
|
982
1142
|
// pass 4: split graph, find tensors that need to be copied
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
}
|
|
993
|
-
}
|
|
994
|
-
sched->splits[0].i_start = 0;
|
|
995
|
-
sched->splits[0].n_inputs = 0;
|
|
996
|
-
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
|
997
|
-
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
|
998
|
-
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
|
999
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1000
|
-
struct ggml_tensor * node = graph->nodes[i];
|
|
1001
|
-
|
|
1002
|
-
if (ggml_is_view_op(node->op)) {
|
|
1003
|
-
continue;
|
|
1143
|
+
{
|
|
1144
|
+
int cur_split = 0;
|
|
1145
|
+
// find the backend of the first split, skipping view ops
|
|
1146
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1147
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1148
|
+
if (!ggml_is_view_op(node->op)) {
|
|
1149
|
+
sched->splits[0].tallocr = node_allocr(node);
|
|
1150
|
+
break;
|
|
1151
|
+
}
|
|
1004
1152
|
}
|
|
1153
|
+
sched->splits[0].i_start = 0;
|
|
1154
|
+
sched->splits[0].n_inputs = 0;
|
|
1155
|
+
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
|
1156
|
+
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
|
1157
|
+
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
|
1158
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1159
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1160
|
+
|
|
1161
|
+
if (ggml_is_view_op(node->op)) {
|
|
1162
|
+
continue;
|
|
1163
|
+
}
|
|
1005
1164
|
|
|
1006
|
-
|
|
1165
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
|
1007
1166
|
|
|
1008
|
-
|
|
1009
|
-
sched->splits[cur_split].i_end = i;
|
|
1010
|
-
cur_split++;
|
|
1011
|
-
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
|
1012
|
-
sched->splits[cur_split].tallocr = node_allocr;
|
|
1013
|
-
sched->splits[cur_split].i_start = i;
|
|
1014
|
-
sched->splits[cur_split].n_inputs = 0;
|
|
1015
|
-
memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
|
|
1016
|
-
cur_allocr = node_allocr;
|
|
1017
|
-
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
|
1018
|
-
}
|
|
1167
|
+
GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
|
|
1019
1168
|
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1169
|
+
if (node_allocr != cur_allocr) {
|
|
1170
|
+
sched->splits[cur_split].i_end = i;
|
|
1171
|
+
cur_split++;
|
|
1172
|
+
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
|
1173
|
+
sched->splits[cur_split].tallocr = node_allocr;
|
|
1174
|
+
sched->splits[cur_split].i_start = i;
|
|
1175
|
+
sched->splits[cur_split].n_inputs = 0;
|
|
1176
|
+
cur_allocr = node_allocr;
|
|
1177
|
+
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
|
1025
1178
|
}
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1179
|
+
|
|
1180
|
+
// find inputs that are not on the same backend
|
|
1181
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1182
|
+
struct ggml_tensor * src = node->src[j];
|
|
1183
|
+
if (src == NULL) {
|
|
1184
|
+
break;
|
|
1185
|
+
}
|
|
1186
|
+
ggml_tallocr_t src_allocr = node_allocr(src);
|
|
1187
|
+
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
|
|
1188
|
+
if (src_allocr != node_allocr) {
|
|
1189
|
+
// check if the input is already in the split
|
|
1190
|
+
bool found = false;
|
|
1191
|
+
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
|
1192
|
+
if (sched->splits[cur_split].inputs[k] == src) {
|
|
1193
|
+
found = true;
|
|
1194
|
+
break;
|
|
1195
|
+
}
|
|
1196
|
+
}
|
|
1197
|
+
|
|
1198
|
+
if (!found) {
|
|
1199
|
+
int n_inputs = sched->splits[cur_split].n_inputs++;
|
|
1200
|
+
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
|
|
1201
|
+
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
|
1202
|
+
sched->splits[cur_split].inputs[n_inputs] = src;
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
// create a copy of the input in the split's backend
|
|
1206
|
+
size_t id = hash_id(src);
|
|
1207
|
+
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
|
1208
|
+
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
|
1209
|
+
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
|
1210
|
+
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
|
1211
|
+
|
|
1212
|
+
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
|
1213
|
+
node_allocr(tensor_copy) = cur_allocr;
|
|
1214
|
+
SET_CAUSE(tensor_copy, "4.cpy");
|
|
1215
|
+
}
|
|
1216
|
+
node->src[j] = sched->node_copies[id][cur_backend_id];
|
|
1040
1217
|
}
|
|
1041
|
-
node->src[j] = sched->node_copies[id][cur_backend_id];
|
|
1042
1218
|
}
|
|
1043
1219
|
}
|
|
1220
|
+
sched->splits[cur_split].i_end = graph->n_nodes;
|
|
1221
|
+
sched->n_splits = cur_split + 1;
|
|
1044
1222
|
}
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
//fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
|
|
1223
|
+
#ifdef DEBUG_PASS4
|
|
1224
|
+
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
|
1225
|
+
#endif
|
|
1049
1226
|
|
|
1050
|
-
#
|
|
1227
|
+
#ifndef NDEBUG
|
|
1051
1228
|
// sanity check: all sources should have the same backend as the node
|
|
1052
1229
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1053
1230
|
struct ggml_tensor * node = graph->nodes[i];
|
|
@@ -1055,6 +1232,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
|
1055
1232
|
if (node_allocr == NULL) {
|
|
1056
1233
|
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
|
1057
1234
|
}
|
|
1235
|
+
if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
|
|
1236
|
+
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
|
1237
|
+
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
|
1238
|
+
node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
|
|
1239
|
+
}
|
|
1058
1240
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1059
1241
|
struct ggml_tensor * src = node->src[j];
|
|
1060
1242
|
if (src == NULL) {
|
|
@@ -1066,8 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
|
1066
1248
|
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
|
1067
1249
|
j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
|
|
1068
1250
|
}
|
|
1251
|
+
if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
|
|
1252
|
+
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
|
1253
|
+
src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
|
|
1254
|
+
src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
|
|
1255
|
+
}
|
|
1069
1256
|
}
|
|
1070
1257
|
}
|
|
1258
|
+
fflush(stderr);
|
|
1071
1259
|
#endif
|
|
1072
1260
|
|
|
1073
1261
|
// create copies of the graph for each split
|
|
@@ -1081,6 +1269,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
|
1081
1269
|
for (int j = 0; j < split->n_inputs; j++) {
|
|
1082
1270
|
struct ggml_tensor * input = split->inputs[j];
|
|
1083
1271
|
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
|
|
1272
|
+
// add a dependency to the input source so that it is not freed before the copy is done
|
|
1273
|
+
GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
|
|
1084
1274
|
input_cpy->src[0] = input;
|
|
1085
1275
|
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
|
1086
1276
|
}
|
|
@@ -1115,24 +1305,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
|
1115
1305
|
uint64_t copy_start_us = ggml_time_us();
|
|
1116
1306
|
for (int j = 0; j < split->n_inputs; j++) {
|
|
1117
1307
|
struct ggml_tensor * input = split->inputs[j];
|
|
1118
|
-
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
}
|
|
1127
|
-
if (input_cpy->buffer == NULL) {
|
|
1128
|
-
fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
|
|
1129
|
-
exit(1);
|
|
1130
|
-
}
|
|
1131
|
-
//GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
|
|
1132
|
-
//GGML_ASSERT(input_cpy->buffer->backend == split_backend);
|
|
1133
|
-
ggml_backend_tensor_copy(input, input_cpy);
|
|
1308
|
+
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
|
|
1309
|
+
|
|
1310
|
+
GGML_ASSERT(input->buffer != NULL);
|
|
1311
|
+
GGML_ASSERT(input_cpy->buffer != NULL);
|
|
1312
|
+
|
|
1313
|
+
// TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
|
|
1314
|
+
// this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
|
|
1315
|
+
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
|
|
1134
1316
|
}
|
|
1135
|
-
//
|
|
1317
|
+
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
|
1136
1318
|
int64_t copy_end_us = ggml_time_us();
|
|
1137
1319
|
copy_us[split_backend_id] += copy_end_us - copy_start_us;
|
|
1138
1320
|
|
|
@@ -1144,7 +1326,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

         uint64_t compute_start_us = ggml_time_us();
         ggml_backend_graph_compute(split_backend, &split->graph);
-        //
+        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }
@@ -1164,26 +1346,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
     for (int i = 0; i < sched->n_backends; i++) {
         ggml_tallocr_reset(sched->tallocs[i]);
     }
+    // reset state for the next run
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+    memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
+    memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+
+    sched->is_reset = true;
 }

-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
+ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+    GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);

-    struct ggml_backend_sched * sched =
-
+    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+
+    // initialize hash table
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
+    sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);

     sched->n_backends = n_backends;
     for (int i = 0; i < n_backends; i++) {
         sched->backends[i] = backends[i];
+        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
     }

     sched->galloc = ggml_gallocr_new();

     // init measure allocs for each backend
     for (int i = 0; i < n_backends; i++) {
-        sched->tallocs[i] =
+        sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
     }

+    sched_reset(sched);
+
     return sched;
 }

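With this hunk, `ggml_backend_sched_new` takes an explicit per-backend list of buffer types and a graph size used to size the internal hash table, and the scheduler is reset as part of construction. A minimal, illustrative sketch of the new call from the caller's side (not part of the diff; it assumes `ggml_backend_cpu_init` from ggml-backend.h, `GGML_DEFAULT_GRAPH_SIZE` from ggml.h, and a worst-case `measure_graph` built by the caller):

    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_t backends[1] = { cpu };

    // passing NULL for bufts selects ggml_backend_get_default_buffer_type() for each backend
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 1, GGML_DEFAULT_GRAPH_SIZE);

    // reserve backend memory based on a worst-case measurement graph (measure_graph assumed)
    ggml_backend_sched_init_measure(sched, measure_graph);

When the scheduler is no longer needed it is released with `ggml_backend_sched_free`, which, per the hunk below, now also frees the scheduler's ggml context.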
@@ -1195,6 +1392,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
         ggml_tallocr_free(sched->tallocs[i]);
     }
     ggml_gallocr_free(sched->galloc);
+    ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->node_talloc);
     free(sched->node_copies);
@@ -1202,12 +1400,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 }

 void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    //
-    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
-    sched->hash_set.size = hash_size;
-    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
-    sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
-    sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+    GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once

     sched_split_graph(sched, measure_graph);
     sched_alloc_splits(sched);
@@ -1216,28 +1409,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
     for (int i = 0; i < sched->n_backends; i++) {
         size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
         ggml_tallocr_free(sched->tallocs[i]);
-        sched->tallocs[i] =
+        sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
     }

     sched_reset(sched);
 }

 void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT(sched->hash_set.size >= graph->
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+    if (!sched->is_reset) {
+        sched_reset(sched);
+    }

     sched_split_graph(sched, graph);
     sched_alloc_splits(sched);
     sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }

+int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+    return sched->n_splits;
+}
+
 ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     return sched->tallocs[backend_index];
 }

 ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
 }

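After these changes, `ggml_backend_sched_graph_compute` checks that the graph fits in the pre-sized hash table and lazily resets the scheduler state, while the new `ggml_backend_sched_reset` and `ggml_backend_sched_get_n_splits` entry points expose the reset and the split count to callers. A hedged sketch of one evaluation step (not part of the diff; headers assumed to be ggml.h and ggml-backend.h, graph construction omitted):

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // one evaluation step with the new scheduler entry points (illustrative only)
    static void eval_step(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
        ggml_backend_sched_graph_compute(sched, graph);  // resets internal state first if needed
        fprintf(stderr, "graph executed in %d split(s)\n", ggml_backend_sched_get_n_splits(sched));
        ggml_backend_sched_reset(sched);                 // make the allocators reusable for the next graph
    }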
@@ -1247,10 +1453,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
     node_allocr(node) = sched->tallocs[backend_index];
 }

+ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    ggml_tallocr_t allocr = node_allocr(node);
+    if (allocr == NULL) {
+        return NULL;
+    }
+    return get_allocr_backend(sched, allocr);
+}
+
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    //GGML_ASSERT(tensor->data == NULL); // views of pre-
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
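The same hunk introduces `ggml_backend_sched_get_node_backend` as the read side of `ggml_backend_sched_set_node_backend`. A small illustrative sketch (not part of the diff; `sched`, `node`, and `gpu_backend` are assumed to come from the caller):

    // pin a tensor to a specific backend, then read the assignment back
    ggml_backend_sched_set_node_backend(sched, node, gpu_backend);

    ggml_backend_t assigned = ggml_backend_sched_get_node_backend(sched, node);
    if (assigned != NULL) { // NULL until the node has been assigned an allocator
        fprintf(stderr, "%s will run on %s\n", node->name, ggml_backend_name(assigned));
    }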
@@ -1316,6 +1531,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor

     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
+        graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
         ggml_backend_view_init(dst->view_src->buffer, dst);
     }
     else {
@@ -1349,6 +1565,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_allocated = ggml_init(params);
     struct ggml_context * ctx_unallocated = ggml_init(params);

+    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+        fprintf(stderr, "failed to allocate context for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer = */ NULL,
+            /* .ctx_allocated = */ NULL,
+            /* .ctx_unallocated = */ NULL,
+            /* .graph = */ NULL,
+        };
+    }
+
     // dup nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1357,6 +1588,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s

     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+    if (buffer == NULL) {
+        fprintf(stderr, "failed to allocate buffer for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer = */ NULL,
+            /* .ctx_allocated = */ NULL,
+            /* .ctx_unallocated = */ NULL,
+            /* .graph = */ NULL,
+        };
+    }

     //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);

@@ -1393,8 +1638,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
     ggml_free(copy.ctx_unallocated);
 }

-
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
     struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+    if (copy.buffer == NULL) {
+        return false;
+    }
+
     struct ggml_cgraph * g1 = graph;
     struct ggml_cgraph * g2 = copy.graph;

@@ -1424,4 +1673,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
     }

     ggml_backend_graph_copy_free(copy);
+
+    return true;
 }
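`ggml_backend_compare_graph_backend` now returns `bool` and bails out early when the graph copy cannot be allocated, so callers should check its result instead of assuming the comparison ran. A hedged usage sketch (not part of the diff; the callback signature is the `ggml_backend_eval_callback` typedef assumed from ggml-backend.h):

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // compare every node; returning true keeps the comparison going
    static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) node_index; (void) t1; (void) t2; (void) user_data;
        return true;
    }

    static void compare_backends(ggml_backend_t b1, ggml_backend_t b2, struct ggml_cgraph * graph) {
        if (!ggml_backend_compare_graph_backend(b1, b2, graph, eval_cb, NULL)) {
            fprintf(stderr, "graph copy failed, comparison skipped\n");
        }
    }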