llama_cpp 0.12.1 → 0.12.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
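The diff below covers the vendored `data/vendor/tmp/llama.cpp/ggml-backend.c`. Most backend entry points gain the `GGML_CALL` calling-convention annotation, buffers gain a `get_name`/`reset`/usage interface, and the graph scheduler is reworked: `ggml_backend_sched_new` now also takes an array of buffer types and a graph size used to pre-size its hash tables, and `ggml_backend_sched_reset` / `ggml_backend_sched_get_n_splits` are new. The sketch below is an illustrative C caller of the updated scheduler API based only on the signatures visible in this diff; it is not code shipped in the gem, and `build_graph()` is a hypothetical helper that records a ggml compute graph.

    // Illustrative sketch only: wiring several backends through the updated scheduler API.
    // build_graph() is a hypothetical helper that records a ggml compute graph.
    #include "ggml.h"
    #include "ggml-backend.h"

    static void run_scheduled(ggml_backend_t * backends, int n_backends,
                              struct ggml_cgraph * (*build_graph)(void)) {
        // passing NULL for the buffer types selects each backend's default buffer type
        ggml_backend_sched_t sched =
            ggml_backend_sched_new(backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE);

        // one measurement pass sizes the per-backend allocators
        ggml_backend_sched_init_measure(sched, build_graph());

        // later graphs are split across the backends and computed
        ggml_backend_sched_graph_compute(sched, build_graph());

        ggml_backend_sched_reset(sched);
        ggml_backend_sched_free(sched);
    }

In 0.12.1 the same constructor took only the backends and their count (the removed signature is visible in the diff), so callers built against the older vendored headers need the two extra arguments when upgrading.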
@@ -15,7 +15,11 @@
|
|
15
15
|
|
16
16
|
// backend buffer type
|
17
17
|
|
18
|
-
|
18
|
+
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
19
|
+
return buft->iface.get_name(buft);
|
20
|
+
}
|
21
|
+
|
22
|
+
GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
19
23
|
return buft->iface.alloc_buffer(buft, size);
|
20
24
|
}
|
21
25
|
|
@@ -23,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
23
27
|
return buft->iface.get_alignment(buft);
|
24
28
|
}
|
25
29
|
|
26
|
-
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
30
|
+
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
27
31
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
28
32
|
if (buft->iface.get_alloc_size) {
|
29
33
|
return buft->iface.get_alloc_size(buft, tensor);
|
@@ -44,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
|
44
48
|
|
45
49
|
// backend buffer
|
46
50
|
|
47
|
-
ggml_backend_buffer_t ggml_backend_buffer_init(
|
51
|
+
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
48
52
|
ggml_backend_buffer_type_t buft,
|
49
53
|
struct ggml_backend_buffer_i iface,
|
50
54
|
ggml_backend_buffer_context_t context,
|
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
58
62
|
/* .buft = */ buft,
|
59
63
|
/* .context = */ context,
|
60
64
|
/* .size = */ size,
|
65
|
+
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
|
61
66
|
};
|
62
67
|
|
63
68
|
return buffer;
|
64
69
|
}
|
65
70
|
|
71
|
+
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
72
|
+
return buffer->iface.get_name(buffer);
|
73
|
+
}
|
74
|
+
|
66
75
|
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
67
76
|
if (buffer == NULL) {
|
68
77
|
return;
|
@@ -86,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
86
95
|
return base;
|
87
96
|
}
|
88
97
|
|
89
|
-
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
98
|
+
GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
90
99
|
// init_tensor is optional
|
91
100
|
if (buffer->iface.init_tensor) {
|
92
101
|
buffer->iface.init_tensor(buffer, tensor);
|
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
|
|
94
103
|
}
|
95
104
|
|
96
105
|
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
97
|
-
return ggml_backend_buft_get_alignment(
|
106
|
+
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
98
107
|
}
|
99
108
|
|
100
109
|
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
101
|
-
return ggml_backend_buft_get_alloc_size(
|
110
|
+
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
102
111
|
}
|
103
112
|
|
104
113
|
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
106
115
|
}
|
107
116
|
|
108
117
|
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
109
|
-
return ggml_backend_buft_is_host(
|
118
|
+
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
110
119
|
}
|
111
120
|
|
112
|
-
|
121
|
+
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
122
|
+
buffer->usage = usage;
|
123
|
+
}
|
124
|
+
|
125
|
+
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
113
126
|
return buffer->buft;
|
114
127
|
}
|
115
128
|
|
129
|
+
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
130
|
+
if (buffer->iface.reset) {
|
131
|
+
buffer->iface.reset(buffer);
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
136
|
+
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
137
|
+
if (dst_buf->iface.cpy_tensor) {
|
138
|
+
return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
|
139
|
+
}
|
140
|
+
return false;
|
141
|
+
}
|
142
|
+
|
116
143
|
// backend
|
117
144
|
|
118
145
|
const char * ggml_backend_name(ggml_backend_t backend) {
|
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
|
|
146
173
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
147
174
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
148
175
|
|
149
|
-
backend->iface.set_tensor_async
|
176
|
+
if (backend->iface.set_tensor_async == NULL) {
|
177
|
+
ggml_backend_tensor_set(tensor, data, offset, size);
|
178
|
+
} else {
|
179
|
+
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
|
180
|
+
}
|
150
181
|
}
|
151
182
|
|
152
183
|
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
153
184
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
154
185
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
155
186
|
|
156
|
-
backend->iface.get_tensor_async
|
187
|
+
if (backend->iface.get_tensor_async == NULL) {
|
188
|
+
ggml_backend_tensor_get(tensor, data, offset, size);
|
189
|
+
} else {
|
190
|
+
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
|
191
|
+
}
|
157
192
|
}
|
158
193
|
|
159
|
-
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
194
|
+
GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
195
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
196
|
+
|
160
197
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
161
|
-
GGML_ASSERT(
|
198
|
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
162
199
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
163
200
|
|
164
|
-
tensor->buffer->iface.set_tensor(
|
201
|
+
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
|
165
202
|
}
|
166
203
|
|
167
|
-
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
204
|
+
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
205
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
206
|
+
|
168
207
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
169
208
|
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
|
170
209
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
171
210
|
|
172
|
-
tensor->buffer->iface.get_tensor(
|
211
|
+
tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
|
173
212
|
}
|
174
213
|
|
175
214
|
void ggml_backend_synchronize(ggml_backend_t backend) {
|
@@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
|
|
190
229
|
|
191
230
|
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
192
231
|
backend->iface.graph_plan_compute(backend, plan);
|
193
|
-
|
194
|
-
// TODO: optional sync
|
195
|
-
ggml_backend_synchronize(backend);
|
196
232
|
}
|
197
233
|
|
198
234
|
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
199
|
-
|
200
|
-
return false;
|
201
|
-
}
|
202
|
-
|
203
|
-
// TODO: optional sync
|
204
|
-
ggml_backend_synchronize(backend);
|
205
|
-
return true;
|
235
|
+
return backend->iface.graph_compute(backend, cgraph);
|
206
236
|
}
|
207
237
|
|
208
238
|
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
@@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
|
227
257
|
}
|
228
258
|
|
229
259
|
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
230
|
-
//printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
|
231
|
-
//printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
|
232
260
|
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
233
261
|
|
234
|
-
// fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
|
235
|
-
|
236
262
|
if (src == dst) {
|
237
263
|
return;
|
238
264
|
}
|
239
265
|
|
240
|
-
|
241
|
-
|
242
|
-
if (dst->buffer
|
243
|
-
|
244
|
-
} else if (src
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
#ifndef NDEBUG
|
249
|
-
fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
|
250
|
-
"are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
|
251
|
-
#endif
|
266
|
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
267
|
+
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
268
|
+
} else if (ggml_backend_buffer_is_host(dst->buffer)) {
|
269
|
+
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
270
|
+
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
271
|
+
#ifndef NDEBUG
|
272
|
+
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
273
|
+
#endif
|
252
274
|
size_t nbytes = ggml_nbytes(src);
|
253
275
|
void * data = malloc(nbytes);
|
254
276
|
ggml_backend_tensor_get(src, data, 0, nbytes);
|
@@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
|
|
257
279
|
}
|
258
280
|
}
|
259
281
|
|
282
|
+
void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
283
|
+
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
284
|
+
|
285
|
+
if (src == dst) {
|
286
|
+
return;
|
287
|
+
}
|
288
|
+
|
289
|
+
if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
|
290
|
+
if (backend->iface.cpy_tensor_async != NULL) {
|
291
|
+
if (backend->iface.cpy_tensor_async(backend, src, dst)) {
|
292
|
+
return;
|
293
|
+
}
|
294
|
+
}
|
295
|
+
}
|
296
|
+
|
297
|
+
size_t nbytes = ggml_nbytes(src);
|
298
|
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
299
|
+
ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
|
300
|
+
}
|
301
|
+
else {
|
302
|
+
ggml_backend_tensor_copy(src, dst);
|
303
|
+
}
|
304
|
+
}
|
305
|
+
|
306
|
+
|
260
307
|
// backend registry
|
261
308
|
|
262
309
|
#define GGML_MAX_BACKENDS_REG 16
|
@@ -271,9 +318,9 @@ struct ggml_backend_reg {
|
|
271
318
|
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
|
272
319
|
static size_t ggml_backend_registry_count = 0;
|
273
320
|
|
274
|
-
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
321
|
+
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
275
322
|
|
276
|
-
static void ggml_backend_registry_init(void) {
|
323
|
+
GGML_CALL static void ggml_backend_registry_init(void) {
|
277
324
|
static bool initialized = false;
|
278
325
|
|
279
326
|
if (initialized) {
|
@@ -286,18 +333,18 @@ static void ggml_backend_registry_init(void) {
|
|
286
333
|
|
287
334
|
// add forward decls here to avoid including the backend headers
|
288
335
|
#ifdef GGML_USE_CUBLAS
|
289
|
-
extern void ggml_backend_cuda_reg_devices(void);
|
336
|
+
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
290
337
|
ggml_backend_cuda_reg_devices();
|
291
338
|
#endif
|
292
339
|
|
293
340
|
#ifdef GGML_USE_METAL
|
294
|
-
extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
295
|
-
extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
341
|
+
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
342
|
+
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
296
343
|
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
297
344
|
#endif
|
298
345
|
}
|
299
346
|
|
300
|
-
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
347
|
+
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
301
348
|
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
302
349
|
|
303
350
|
size_t id = ggml_backend_registry_count;
|
@@ -392,68 +439,80 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
|
392
439
|
|
393
440
|
// backend CPU
|
394
441
|
|
395
|
-
static
|
442
|
+
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
443
|
+
return "CPU";
|
444
|
+
|
445
|
+
GGML_UNUSED(buffer);
|
446
|
+
}
|
447
|
+
|
448
|
+
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
396
449
|
return (void *)buffer->context;
|
397
450
|
}
|
398
451
|
|
399
|
-
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
452
|
+
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
400
453
|
free(buffer->context);
|
401
454
|
}
|
402
455
|
|
403
|
-
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
456
|
+
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
404
457
|
memcpy((char *)tensor->data + offset, data, size);
|
405
458
|
|
406
459
|
GGML_UNUSED(buffer);
|
407
460
|
}
|
408
461
|
|
409
|
-
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
462
|
+
GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
410
463
|
memcpy(data, (const char *)tensor->data + offset, size);
|
411
464
|
|
412
465
|
GGML_UNUSED(buffer);
|
413
466
|
}
|
414
467
|
|
415
|
-
static
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
}
|
420
|
-
|
421
|
-
static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
422
|
-
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
468
|
+
GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
469
|
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
470
|
+
memcpy(dst->data, src->data, ggml_nbytes(src));
|
471
|
+
return true;
|
472
|
+
}
|
473
|
+
return false;
|
423
474
|
|
424
475
|
GGML_UNUSED(buffer);
|
425
476
|
}
|
426
477
|
|
427
|
-
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
478
|
+
GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
428
479
|
memset(buffer->context, value, buffer->size);
|
429
480
|
}
|
430
481
|
|
431
482
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
483
|
+
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
432
484
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
433
485
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
434
486
|
/* .init_tensor = */ NULL, // no initialization required
|
435
487
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
436
488
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
437
|
-
/* .
|
438
|
-
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
489
|
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
439
490
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
491
|
+
/* .reset = */ NULL,
|
440
492
|
};
|
441
493
|
|
442
494
|
// for buffers from ptr, free is not called
|
443
495
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
496
|
+
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
444
497
|
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
445
498
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
446
499
|
/* .init_tensor = */ NULL, // no initialization required
|
447
500
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
448
501
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
449
|
-
/* .
|
450
|
-
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
502
|
+
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
451
503
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
504
|
+
/* .reset = */ NULL,
|
452
505
|
};
|
453
506
|
|
454
507
|
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
455
508
|
|
456
|
-
static
|
509
|
+
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
510
|
+
return "CPU";
|
511
|
+
|
512
|
+
GGML_UNUSED(buft);
|
513
|
+
}
|
514
|
+
|
515
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
457
516
|
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
458
517
|
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
459
518
|
|
@@ -462,27 +521,28 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
|
|
462
521
|
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
463
522
|
}
|
464
523
|
|
465
|
-
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
524
|
+
GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
466
525
|
return TENSOR_ALIGNMENT;
|
467
526
|
|
468
527
|
GGML_UNUSED(buft);
|
469
528
|
}
|
470
529
|
|
471
|
-
static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
530
|
+
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
472
531
|
return ggml_backend_is_cpu(backend);
|
473
532
|
|
474
533
|
GGML_UNUSED(buft);
|
475
534
|
}
|
476
535
|
|
477
|
-
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
536
|
+
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
478
537
|
return true;
|
479
538
|
|
480
539
|
GGML_UNUSED(buft);
|
481
540
|
}
|
482
541
|
|
483
|
-
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
542
|
+
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
484
543
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
485
544
|
/* .iface = */ {
|
545
|
+
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
486
546
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
487
547
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
488
548
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
@@ -501,11 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
501
561
|
|
502
562
|
#include <hbwmalloc.h>
|
503
563
|
|
504
|
-
static
|
564
|
+
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
565
|
+
return "CPU_HBM";
|
566
|
+
|
567
|
+
GGML_UNUSED(buft);
|
568
|
+
}
|
569
|
+
|
570
|
+
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
571
|
+
return "CPU_HBM";
|
572
|
+
|
573
|
+
GGML_UNUSED(buf);
|
574
|
+
}
|
575
|
+
|
576
|
+
GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
505
577
|
hbw_free(buffer->context);
|
506
578
|
}
|
507
579
|
|
508
|
-
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
580
|
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
509
581
|
//void * ptr = hbw_malloc(size);
|
510
582
|
void * ptr;
|
511
583
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
@@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
|
514
586
|
return NULL;
|
515
587
|
}
|
516
588
|
|
517
|
-
// FIXME: this is a hack to avoid having to implement a new buffer type
|
518
589
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
519
590
|
buffer->buft = buft;
|
591
|
+
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
520
592
|
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
521
593
|
|
522
594
|
return buffer;
|
523
595
|
}
|
524
596
|
|
525
|
-
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
597
|
+
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
526
598
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
527
599
|
/* .iface = */ {
|
600
|
+
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
528
601
|
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
529
602
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
530
603
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
@@ -544,20 +617,20 @@ struct ggml_backend_cpu_context {
|
|
544
617
|
size_t work_size;
|
545
618
|
};
|
546
619
|
|
547
|
-
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
620
|
+
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
548
621
|
return "CPU";
|
549
622
|
|
550
623
|
GGML_UNUSED(backend);
|
551
624
|
}
|
552
625
|
|
553
|
-
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
626
|
+
GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
554
627
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
555
628
|
free(cpu_ctx->work_data);
|
556
629
|
free(cpu_ctx);
|
557
630
|
free(backend);
|
558
631
|
}
|
559
632
|
|
560
|
-
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
633
|
+
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
561
634
|
return ggml_backend_cpu_buffer_type();
|
562
635
|
|
563
636
|
GGML_UNUSED(backend);
|
@@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu {
|
|
568
641
|
struct ggml_cgraph cgraph;
|
569
642
|
};
|
570
643
|
|
571
|
-
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
644
|
+
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
572
645
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
573
646
|
|
574
647
|
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
@@ -583,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
|
|
583
656
|
return cpu_plan;
|
584
657
|
}
|
585
658
|
|
586
|
-
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
659
|
+
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
587
660
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
588
661
|
|
589
662
|
free(cpu_plan->cplan.work_data);
|
@@ -592,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
|
|
592
665
|
GGML_UNUSED(backend);
|
593
666
|
}
|
594
667
|
|
595
|
-
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
668
|
+
GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
596
669
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
597
670
|
|
598
671
|
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
@@ -600,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
|
|
600
673
|
GGML_UNUSED(backend);
|
601
674
|
}
|
602
675
|
|
603
|
-
static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
676
|
+
GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
604
677
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
605
678
|
|
606
679
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
@@ -617,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
|
|
617
690
|
return true;
|
618
691
|
}
|
619
692
|
|
620
|
-
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
693
|
+
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
621
694
|
switch (op->op) {
|
622
695
|
case GGML_OP_MUL_MAT:
|
623
696
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
@@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
634
707
|
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
635
708
|
/* .set_tensor_async = */ NULL,
|
636
709
|
/* .get_tensor_async = */ NULL,
|
637
|
-
/* .
|
638
|
-
/* .cpy_tensor_to_async = */ NULL,
|
710
|
+
/* .cpy_tensor_async = */ NULL,
|
639
711
|
/* .synchronize = */ NULL,
|
640
712
|
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
641
713
|
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
@@ -660,8 +732,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
660
732
|
return cpu_backend;
|
661
733
|
}
|
662
734
|
|
663
|
-
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
664
|
-
return backend->iface.get_name == ggml_backend_cpu_name;
|
735
|
+
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
736
|
+
return backend && backend->iface.get_name == ggml_backend_cpu_name;
|
665
737
|
}
|
666
738
|
|
667
739
|
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
@@ -671,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
671
743
|
ctx->n_threads = n_threads;
|
672
744
|
}
|
673
745
|
|
674
|
-
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
746
|
+
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
675
747
|
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
676
748
|
}
|
677
749
|
|
678
|
-
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
750
|
+
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
679
751
|
return ggml_backend_cpu_init();
|
680
752
|
|
681
753
|
GGML_UNUSED(params);
|
@@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
|
|
685
757
|
|
686
758
|
// scheduler
|
687
759
|
|
688
|
-
#define GGML_MAX_BACKENDS
|
760
|
+
#define GGML_MAX_BACKENDS 16
|
689
761
|
#define GGML_MAX_SPLITS 256
|
690
762
|
#define GGML_MAX_SPLIT_INPUTS 16
|
691
763
|
|
@@ -695,21 +767,29 @@ struct ggml_backend_sched_split {
|
|
695
767
|
int i_end;
|
696
768
|
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
|
697
769
|
int n_inputs;
|
770
|
+
// graph view of this split
|
698
771
|
struct ggml_cgraph graph;
|
699
772
|
};
|
700
773
|
|
701
774
|
struct ggml_backend_sched {
|
775
|
+
bool is_reset; // true if the scheduler has been reset since the last graph split
|
776
|
+
|
702
777
|
int n_backends;
|
703
778
|
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
779
|
+
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
|
704
780
|
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
|
705
781
|
|
706
782
|
ggml_gallocr_t galloc;
|
707
783
|
|
784
|
+
// hash keys of the nodes in the graph
|
708
785
|
struct ggml_hash_set hash_set;
|
709
|
-
|
710
|
-
|
786
|
+
// hash values (arrays of [hash_set.size])
|
787
|
+
ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
|
788
|
+
struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
|
711
789
|
|
790
|
+
// copy of the graph with modified inputs
|
712
791
|
struct ggml_cgraph * graph;
|
792
|
+
|
713
793
|
struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
|
714
794
|
int n_splits;
|
715
795
|
|
@@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
|
|
750
830
|
return INT_MAX;
|
751
831
|
}
|
752
832
|
|
753
|
-
static
|
833
|
+
static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
|
754
834
|
if (buffer == NULL) {
|
755
835
|
return NULL;
|
756
836
|
}
|
837
|
+
|
838
|
+
// check if this is already allocate in a allocr buffer (from user manual allocations)
|
839
|
+
for (int i = 0; i < sched->n_backends; i++) {
|
840
|
+
if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
|
841
|
+
return sched->tallocs[i];
|
842
|
+
}
|
843
|
+
}
|
844
|
+
|
757
845
|
// find highest prio backend that supports the buffer type
|
758
846
|
for (int i = 0; i < sched->n_backends; i++) {
|
759
847
|
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
760
|
-
return sched->
|
848
|
+
return sched->tallocs[i];
|
761
849
|
}
|
762
850
|
}
|
763
851
|
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
@@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
|
|
767
855
|
if (allocr == NULL) {
|
768
856
|
return NULL;
|
769
857
|
}
|
770
|
-
// find highest prio backend that supports the buffer type
|
771
858
|
for (int i = 0; i < sched->n_backends; i++) {
|
772
859
|
if (sched->tallocs[i] == allocr) {
|
773
860
|
return sched->backends[i];
|
@@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
|
|
777
864
|
}
|
778
865
|
|
779
866
|
#if 0
|
780
|
-
static char causes[GGML_DEFAULT_GRAPH_SIZE*
|
867
|
+
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
|
781
868
|
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
782
869
|
#define GET_CAUSE(node) causes[hash_id(node)]
|
783
870
|
#else
|
@@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
|
|
786
873
|
#endif
|
787
874
|
|
788
875
|
// returns the backend that should be used for the node based on the current locations
|
789
|
-
static
|
790
|
-
//
|
791
|
-
// ie. kv cache updates
|
792
|
-
// note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
|
876
|
+
static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
877
|
+
// assign pre-allocated nodes to their backend
|
793
878
|
// dst
|
794
|
-
|
795
|
-
if (
|
879
|
+
ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
|
880
|
+
if (cur_allocr != NULL) {
|
796
881
|
SET_CAUSE(node, "1.dst");
|
797
|
-
return
|
882
|
+
return cur_allocr;
|
798
883
|
}
|
799
|
-
|
800
884
|
// view_src
|
801
|
-
if (node->view_src != NULL
|
802
|
-
|
803
|
-
|
885
|
+
if (node->view_src != NULL) {
|
886
|
+
cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
|
887
|
+
if (cur_allocr != NULL) {
|
888
|
+
SET_CAUSE(node, "1.vsrc");
|
889
|
+
return cur_allocr;
|
890
|
+
}
|
804
891
|
}
|
805
|
-
|
806
|
-
// src
|
807
|
-
int cur_prio = INT_MAX;
|
808
|
-
size_t cur_size = 0;
|
809
|
-
|
892
|
+
// assign nodes that use weights to the backend of the weights
|
810
893
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
811
894
|
const struct ggml_tensor * src = node->src[i];
|
812
895
|
if (src == NULL) {
|
813
896
|
break;
|
814
897
|
}
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
cur_prio = src_prio;
|
821
|
-
cur_size = src_size;
|
822
|
-
cur_backend = src_backend;
|
823
|
-
SET_CAUSE(node, "1.src%d", i);
|
824
|
-
}
|
898
|
+
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
899
|
+
ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
|
900
|
+
// operations with weights are always run on the same backend as the weights
|
901
|
+
SET_CAUSE(node, "1.wgt%d", i);
|
902
|
+
return src_allocr;
|
825
903
|
}
|
826
904
|
}
|
827
|
-
|
905
|
+
|
906
|
+
return NULL;
|
828
907
|
}
|
829
908
|
|
830
909
|
static char * fmt_size(size_t size) {
|
@@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
857
936
|
}
|
858
937
|
ggml_tallocr_t node_allocr = node_allocr(node);
|
859
938
|
ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
|
860
|
-
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%
|
939
|
+
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
861
940
|
fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
|
862
941
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
863
942
|
struct ggml_tensor * src = node->src[j];
|
@@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
866
945
|
}
|
867
946
|
ggml_tallocr_t src_allocr = node_allocr(src);
|
868
947
|
ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
|
869
|
-
fprintf(stderr, " %20.20s (%
|
948
|
+
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
870
949
|
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
871
950
|
}
|
872
951
|
fprintf(stderr, "\n");
|
@@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
|
|
882
961
|
return dup;
|
883
962
|
}
|
884
963
|
|
964
|
+
|
965
|
+
//#define DEBUG_PASS1
|
966
|
+
//#define DEBUG_PASS2
|
967
|
+
//#define DEBUG_PASS3
|
968
|
+
//#define DEBUG_PASS4
|
969
|
+
|
885
970
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
886
|
-
// TODO: merge passes
|
887
971
|
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
888
|
-
// reset
|
889
|
-
size_t hash_size = sched->hash_set.size;
|
890
|
-
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
891
|
-
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
892
|
-
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
972
|
+
// reset splits
|
893
973
|
sched->n_splits = 0;
|
974
|
+
sched->is_reset = false;
|
894
975
|
|
895
976
|
struct ggml_init_params params = {
|
896
977
|
/* .mem_size = */ sizeof(sched->context_buffer),
|
@@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
898
979
|
/* .no_alloc = */ true
|
899
980
|
};
|
900
981
|
|
901
|
-
|
902
|
-
ggml_free(sched->ctx);
|
903
|
-
}
|
982
|
+
ggml_free(sched->ctx);
|
904
983
|
|
905
984
|
sched->ctx = ggml_init(params);
|
985
|
+
if (sched->ctx == NULL) {
|
986
|
+
fprintf(stderr, "%s: failed to initialize context\n", __func__);
|
987
|
+
GGML_ASSERT(false);
|
988
|
+
}
|
906
989
|
|
907
|
-
// pass 1: assign backends to ops with allocated inputs
|
990
|
+
// pass 1: assign backends to ops with pre-allocated inputs
|
908
991
|
for (int i = 0; i < graph->n_leafs; i++) {
|
909
992
|
struct ggml_tensor * leaf = graph->leafs[i];
|
910
993
|
if (node_allocr(leaf) != NULL) {
|
911
994
|
// do not overwrite user assignments
|
912
995
|
continue;
|
913
996
|
}
|
914
|
-
|
915
|
-
if (leaf_backend == NULL && leaf->view_src != NULL) {
|
916
|
-
leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
|
917
|
-
}
|
918
|
-
if (leaf_backend != NULL) {
|
919
|
-
node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
|
920
|
-
}
|
997
|
+
node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
|
921
998
|
}
|
922
999
|
|
923
1000
|
for (int i = 0; i < graph->n_nodes; i++) {
|
@@ -926,50 +1003,120 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
926
1003
|
// do not overwrite user assignments
|
927
1004
|
continue;
|
928
1005
|
}
|
929
|
-
|
930
|
-
|
931
|
-
|
1006
|
+
node_allocr(node) = sched_allocr_from_cur(sched, node);
|
1007
|
+
// src
|
1008
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
1009
|
+
struct ggml_tensor * src = node->src[j];
|
1010
|
+
if (src == NULL) {
|
1011
|
+
break;
|
1012
|
+
}
|
1013
|
+
if (node_allocr(src) == NULL) {
|
1014
|
+
node_allocr(src) = sched_allocr_from_cur(sched, src);
|
1015
|
+
}
|
932
1016
|
}
|
933
1017
|
}
|
934
|
-
|
1018
|
+
#ifdef DEBUG_PASS1
|
1019
|
+
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
1020
|
+
#endif
|
935
1021
|
|
936
|
-
// pass 2:
|
937
|
-
//
|
938
|
-
//
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
1022
|
+
// pass 2: expand current backend assignments
|
1023
|
+
// assign the same backend to adjacent nodes
|
1024
|
+
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
1025
|
+
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
1026
|
+
|
1027
|
+
// pass 2.1 expand gpu up
|
1028
|
+
{
|
1029
|
+
ggml_tallocr_t cur_allocr = NULL;
|
1030
|
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1031
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1032
|
+
if (ggml_is_view_op(node->op)) {
|
1033
|
+
continue;
|
1034
|
+
}
|
1035
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
1036
|
+
if (node_allocr != NULL) {
|
1037
|
+
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
1038
|
+
// skip cpu (lowest prio backend)
|
1039
|
+
cur_allocr = NULL;
|
1040
|
+
} else {
|
1041
|
+
cur_allocr = node_allocr;
|
949
1042
|
}
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
1043
|
+
} else {
|
1044
|
+
node_allocr(node) = cur_allocr;
|
1045
|
+
SET_CAUSE(node, "2.1");
|
1046
|
+
}
|
1047
|
+
}
|
1048
|
+
}
|
1049
|
+
|
1050
|
+
// pass 2.2 expand gpu down
|
1051
|
+
{
|
1052
|
+
ggml_tallocr_t cur_allocr = NULL;
|
1053
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1054
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1055
|
+
if (ggml_is_view_op(node->op)) {
|
1056
|
+
continue;
|
1057
|
+
}
|
1058
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
1059
|
+
if (node_allocr != NULL) {
|
1060
|
+
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
1061
|
+
// skip cpu (lowest prio backend)
|
1062
|
+
cur_allocr = NULL;
|
1063
|
+
} else {
|
1064
|
+
cur_allocr = node_allocr;
|
960
1065
|
}
|
1066
|
+
} else {
|
1067
|
+
node_allocr(node) = cur_allocr;
|
1068
|
+
SET_CAUSE(node, "2.2");
|
961
1069
|
}
|
1070
|
+
}
|
1071
|
+
}
|
1072
|
+
|
1073
|
+
// pass 2.3 expand rest up
|
1074
|
+
{
|
1075
|
+
ggml_tallocr_t cur_allocr = NULL;
|
1076
|
+
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
1077
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1078
|
+
if (ggml_is_view_op(node->op)) {
|
1079
|
+
continue;
|
1080
|
+
}
|
1081
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
962
1082
|
if (node_allocr != NULL) {
|
963
|
-
|
1083
|
+
cur_allocr = node_allocr;
|
1084
|
+
} else {
|
1085
|
+
node_allocr(node) = cur_allocr;
|
1086
|
+
SET_CAUSE(node, "2.3");
|
964
1087
|
}
|
965
1088
|
}
|
966
1089
|
}
|
967
|
-
//printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
968
1090
|
|
969
|
-
// pass
|
1091
|
+
// pass 2.4 expand rest down
|
1092
|
+
{
|
1093
|
+
ggml_tallocr_t cur_allocr = NULL;
|
1094
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1095
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1096
|
+
if (ggml_is_view_op(node->op)) {
|
1097
|
+
continue;
|
1098
|
+
}
|
1099
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
1100
|
+
if (node_allocr != NULL) {
|
1101
|
+
cur_allocr = node_allocr;
|
1102
|
+
} else {
|
1103
|
+
node_allocr(node) = cur_allocr;
|
1104
|
+
SET_CAUSE(node, "2.4");
|
1105
|
+
}
|
1106
|
+
}
|
1107
|
+
}
|
1108
|
+
#ifdef DEBUG_PASS2
|
1109
|
+
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
1110
|
+
#endif
|
1111
|
+
|
1112
|
+
// pass 3: assign backends to remaining src from dst and view_src
|
970
1113
|
for (int i = 0; i < graph->n_nodes; i++) {
|
971
1114
|
struct ggml_tensor * node = graph->nodes[i];
|
972
|
-
ggml_tallocr_t
|
1115
|
+
ggml_tallocr_t cur_allocr = node_allocr(node);
|
1116
|
+
if (node->view_src != NULL && cur_allocr == NULL) {
|
1117
|
+
cur_allocr = node_allocr(node) = node_allocr(node->view_src);
|
1118
|
+
SET_CAUSE(node, "3.vsrc");
|
1119
|
+
}
|
973
1120
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
974
1121
|
struct ggml_tensor * src = node->src[j];
|
975
1122
|
if (src == NULL) {
|
@@ -977,81 +1124,107 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
977
1124
|
}
|
978
1125
|
ggml_tallocr_t src_allocr = node_allocr(src);
|
979
1126
|
if (src_allocr == NULL) {
|
980
|
-
|
1127
|
+
if (src->view_src != NULL) {
|
1128
|
+
// views are always on the same backend as the source
|
1129
|
+
node_allocr(src) = node_allocr(src->view_src);
|
1130
|
+
SET_CAUSE(src, "3.vsrc");
|
1131
|
+
} else {
|
1132
|
+
node_allocr(src) = cur_allocr;
|
1133
|
+
SET_CAUSE(src, "3.cur");
|
1134
|
+
}
|
981
1135
|
}
|
982
1136
|
}
|
983
1137
|
}
|
984
|
-
|
1138
|
+
#ifdef DEBUG_PASS3
|
1139
|
+
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
1140
|
+
#endif
|
985
1141
|
|
986
1142
|
// pass 4: split graph, find tensors that need to be copied
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
|
995
|
-
|
996
|
-
}
|
997
|
-
}
|
998
|
-
sched->splits[0].i_start = 0;
|
999
|
-
sched->splits[0].n_inputs = 0;
|
1000
|
-
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
1001
|
-
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
1002
|
-
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
1003
|
-
for (int i = 0; i < graph->n_nodes; i++) {
|
1004
|
-
struct ggml_tensor * node = graph->nodes[i];
|
1005
|
-
|
1006
|
-
if (ggml_is_view_op(node->op)) {
|
1007
|
-
continue;
|
1143
|
+
{
|
1144
|
+
int cur_split = 0;
|
1145
|
+
// find the backend of the first split, skipping view ops
|
1146
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1147
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1148
|
+
if (!ggml_is_view_op(node->op)) {
|
1149
|
+
sched->splits[0].tallocr = node_allocr(node);
|
1150
|
+
break;
|
1151
|
+
}
|
1008
1152
|
}
|
1153
|
+
sched->splits[0].i_start = 0;
|
1154
|
+
sched->splits[0].n_inputs = 0;
|
1155
|
+
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
1156
|
+
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
1157
|
+
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
1158
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
1159
|
+
struct ggml_tensor * node = graph->nodes[i];
|
1160
|
+
|
1161
|
+
if (ggml_is_view_op(node->op)) {
|
1162
|
+
continue;
|
1163
|
+
}
|
1009
1164
|
|
1010
|
-
|
1165
|
+
ggml_tallocr_t node_allocr = node_allocr(node);
|
1011
1166
|
|
1012
|
-
|
1013
|
-
sched->splits[cur_split].i_end = i;
|
1014
|
-
cur_split++;
|
1015
|
-
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
1016
|
-
sched->splits[cur_split].tallocr = node_allocr;
|
1017
|
-
sched->splits[cur_split].i_start = i;
|
1018
|
-
sched->splits[cur_split].n_inputs = 0;
|
1019
|
-
memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
|
1020
|
-
cur_allocr = node_allocr;
|
1021
|
-
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
1022
|
-
}
|
1167
|
+
GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
|
1023
1168
|
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1169
|
+
if (node_allocr != cur_allocr) {
|
1170
|
+
sched->splits[cur_split].i_end = i;
|
1171
|
+
cur_split++;
|
1172
|
+
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
1173
|
+
sched->splits[cur_split].tallocr = node_allocr;
|
1174
|
+
sched->splits[cur_split].i_start = i;
|
1175
|
+
sched->splits[cur_split].n_inputs = 0;
|
1176
|
+
cur_allocr = node_allocr;
|
1177
|
+
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
1029
1178
|
}
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1179
|
+
|
1180
|
+
// find inputs that are not on the same backend
|
1181
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
1182
|
+
struct ggml_tensor * src = node->src[j];
|
1183
|
+
if (src == NULL) {
|
1184
|
+
break;
|
1185
|
+
}
|
1186
|
+
ggml_tallocr_t src_allocr = node_allocr(src);
|
1187
|
+
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
|
1188
|
+
if (src_allocr != node_allocr) {
|
1189
|
+
// check if the input is already in the split
|
1190
|
+
bool found = false;
|
1191
|
+
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
1192
|
+
if (sched->splits[cur_split].inputs[k] == src) {
|
1193
|
+
found = true;
|
1194
|
+
break;
|
1195
|
+
}
|
1196
|
+
}
|
1197
|
+
|
1198
|
+
if (!found) {
|
1199
|
+
int n_inputs = sched->splits[cur_split].n_inputs++;
|
1200
|
+
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
|
1201
|
+
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
1202
|
+
sched->splits[cur_split].inputs[n_inputs] = src;
|
1203
|
+
}
|
1204
|
+
|
1205
|
+
// create a copy of the input in the split's backend
|
1206
|
+
size_t id = hash_id(src);
|
1207
|
+
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
1208
|
+
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
1209
|
+
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
1210
|
+
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
1211
|
+
|
1212
|
+
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
1213
|
+
node_allocr(tensor_copy) = cur_allocr;
|
1214
|
+
SET_CAUSE(tensor_copy, "4.cpy");
|
1215
|
+
}
|
1216
|
+
node->src[j] = sched->node_copies[id][cur_backend_id];
|
1044
1217
|
}
|
1045
|
-
node->src[j] = sched->node_copies[id][cur_backend_id];
|
1046
1218
|
}
|
1047
1219
|
}
|
1220
|
+
sched->splits[cur_split].i_end = graph->n_nodes;
|
1221
|
+
sched->n_splits = cur_split + 1;
|
1048
1222
|
}
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
//fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
|
1223
|
+
#ifdef DEBUG_PASS4
|
1224
|
+
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
1225
|
+
#endif
|
1053
1226
|
|
1054
|
-
#
|
1227
|
+
#ifndef NDEBUG
|
1055
1228
|
// sanity check: all sources should have the same backend as the node
|
1056
1229
|
for (int i = 0; i < graph->n_nodes; i++) {
|
1057
1230
|
struct ggml_tensor * node = graph->nodes[i];
|
@@ -1059,6 +1232,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
1059
1232
|
if (node_allocr == NULL) {
|
1060
1233
|
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
1061
1234
|
}
|
1235
|
+
if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
|
1236
|
+
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
1237
|
+
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
1238
|
+
node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
|
1239
|
+
}
|
1062
1240
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
1063
1241
|
struct ggml_tensor * src = node->src[j];
|
1064
1242
|
if (src == NULL) {
|
@@ -1070,8 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
1070
1248
|
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
1071
1249
|
j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
|
1072
1250
|
}
|
1251
|
+
if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
|
1252
|
+
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
1253
|
+
src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
|
1254
|
+
src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
|
1255
|
+
}
|
1073
1256
|
}
|
1074
1257
|
}
|
1258
|
+
fflush(stderr);
|
1075
1259
|
#endif
|
1076
1260
|
|
1077
1261
|
// create copies of the graph for each split
|
@@ -1085,6 +1269,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
1085
1269
|
for (int j = 0; j < split->n_inputs; j++) {
|
1086
1270
|
struct ggml_tensor * input = split->inputs[j];
|
1087
1271
|
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
|
1272
|
+
// add a dependency to the input source so that it is not freed before the copy is done
|
1273
|
+
GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
|
1088
1274
|
input_cpy->src[0] = input;
|
1089
1275
|
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
1090
1276
|
}
|
@@ -1119,24 +1305,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
1119
1305
|
uint64_t copy_start_us = ggml_time_us();
|
1120
1306
|
for (int j = 0; j < split->n_inputs; j++) {
|
1121
1307
|
struct ggml_tensor * input = split->inputs[j];
|
1122
|
-
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
}
|
1131
|
-
if (input_cpy->buffer == NULL) {
|
1132
|
-
fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
|
1133
|
-
exit(1);
|
1134
|
-
}
|
1135
|
-
//GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
|
1136
|
-
//GGML_ASSERT(input_cpy->buffer->backend == split_backend);
|
1137
|
-
ggml_backend_tensor_copy(input, input_cpy);
|
1308
|
+
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
|
1309
|
+
|
1310
|
+
GGML_ASSERT(input->buffer != NULL);
|
1311
|
+
GGML_ASSERT(input_cpy->buffer != NULL);
|
1312
|
+
|
1313
|
+
// TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
|
1314
|
+
// this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
|
1315
|
+
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
|
1138
1316
|
}
|
1139
|
-
//
|
1317
|
+
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
1140
1318
|
int64_t copy_end_us = ggml_time_us();
|
1141
1319
|
copy_us[split_backend_id] += copy_end_us - copy_start_us;
|
1142
1320
|
|
@@ -1148,7 +1326,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
1148
1326
|
|
1149
1327
|
uint64_t compute_start_us = ggml_time_us();
|
1150
1328
|
ggml_backend_graph_compute(split_backend, &split->graph);
|
1151
|
-
//
|
1329
|
+
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
1152
1330
|
uint64_t compute_end_us = ggml_time_us();
|
1153
1331
|
compute_us[split_backend_id] += compute_end_us - compute_start_us;
|
1154
1332
|
}
|
@@ -1168,26 +1346,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
|
|
1168
1346
|
for (int i = 0; i < sched->n_backends; i++) {
|
1169
1347
|
ggml_tallocr_reset(sched->tallocs[i]);
|
1170
1348
|
}
|
1349
|
+
// reset state for the next run
|
1350
|
+
size_t hash_size = sched->hash_set.size;
|
1351
|
+
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
1352
|
+
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
1353
|
+
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
1354
|
+
|
1355
|
+
sched->is_reset = true;
|
1171
1356
|
}
|
1172
1357
|
|
1173
|
-
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
|
1358
|
+
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
1359
|
+
GGML_ASSERT(n_backends > 0);
|
1174
1360
|
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
|
1175
1361
|
|
1176
|
-
struct ggml_backend_sched * sched =
|
1177
|
-
|
1362
|
+
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
1363
|
+
|
1364
|
+
// initialize hash table
|
1365
|
+
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
1366
|
+
sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
|
1367
|
+
sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
|
1178
1368
|
|
1179
1369
|
sched->n_backends = n_backends;
|
1180
1370
|
for (int i = 0; i < n_backends; i++) {
|
1181
1371
|
sched->backends[i] = backends[i];
|
1372
|
+
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
|
1182
1373
|
}
|
1183
1374
|
|
1184
1375
|
sched->galloc = ggml_gallocr_new();
|
1185
1376
|
|
1186
1377
|
// init measure allocs for each backend
|
1187
1378
|
for (int i = 0; i < n_backends; i++) {
|
1188
|
-
sched->tallocs[i] =
|
1379
|
+
sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
|
1189
1380
|
}
|
1190
1381
|
|
1382
|
+
sched_reset(sched);
|
1383
|
+
|
1191
1384
|
return sched;
|
1192
1385
|
}
|
1193
1386
|
|
@@ -1199,6 +1392,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
         ggml_tallocr_free(sched->tallocs[i]);
     }
     ggml_gallocr_free(sched->galloc);
+    ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->node_talloc);
     free(sched->node_copies);
@@ -1206,12 +1400,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 }

 void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    //
-    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
-    sched->hash_set.size = hash_size;
-    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
-    sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
-    sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+    GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once

     sched_split_graph(sched, measure_graph);
     sched_alloc_splits(sched);
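Instead of building the hash tables here, ggml_backend_sched_init_measure now asserts that the scheduler still holds its measure allocators, i.e. it may only be called once per scheduler. A brief sketch of the intended flow (measure_graph and graph are assumed to be built elsewhere, with the measure graph covering the worst-case shapes):

    #include "ggml-backend.h"

    // Sketch: measure once, then compute real graphs with the sized allocators.
    static void measure_then_run(ggml_backend_sched_t sched,
                                 struct ggml_cgraph * measure_graph,
                                 struct ggml_cgraph * graph) {
        ggml_backend_sched_init_measure(sched, measure_graph); // valid exactly once
        ggml_backend_sched_graph_compute(sched, graph);        // reuses the measured buffers
    }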
@@ -1220,28 +1409,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
     for (int i = 0; i < sched->n_backends; i++) {
         size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
         ggml_tallocr_free(sched->tallocs[i]);
-        sched->tallocs[i] =
+        sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
     }

     sched_reset(sched);
 }

 void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT(sched->hash_set.size >= graph->
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+    if (!sched->is_reset) {
+        sched_reset(sched);
+    }

     sched_split_graph(sched, graph);
     sched_alloc_splits(sched);
     sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }

+int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+    return sched->n_splits;
+}
+
 ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     return sched->tallocs[backend_index];
 }

 ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
 }

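ggml_backend_sched_graph_compute now resets lazily via is_reset, the split count is exposed through the new ggml_backend_sched_get_n_splits, and ggml_backend_sched_reset allows an explicit reset. A short sketch combining the three calls (sched and graph are assumed to be valid handles):

    #include <stdio.h>
    #include "ggml-backend.h"

    // Sketch: run a graph, report how many splits were scheduled, then reset explicitly.
    static void run_and_report(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
        ggml_backend_sched_graph_compute(sched, graph);
        printf("graph executed in %d split(s)\n", ggml_backend_sched_get_n_splits(sched));
        ggml_backend_sched_reset(sched); // optional: the next graph_compute would reset lazily anyway
    }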
@@ -1251,10 +1453,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
     node_allocr(node) = sched->tallocs[backend_index];
 }

+ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    ggml_tallocr_t allocr = node_allocr(node);
+    if (allocr == NULL) {
+        return NULL;
+    }
+    return get_allocr_backend(sched, allocr);
+}
+
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    //GGML_ASSERT(tensor->data == NULL); // views of pre-
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
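The new ggml_backend_sched_get_node_backend returns NULL for nodes the scheduler has not assigned yet. A sketch of querying the assignment after a compute; ggml_backend_name is the existing backend-name accessor, and node is assumed to be a tensor from the scheduled graph:

    #include <stdio.h>
    #include "ggml-backend.h"

    // Sketch: report which backend a node ended up on, if any.
    static void print_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
        ggml_backend_t backend = ggml_backend_sched_get_node_backend(sched, node);
        if (backend == NULL) {
            printf("%s: not assigned yet\n", node->name);
        } else {
            printf("%s: assigned to %s\n", node->name, ggml_backend_name(backend));
        }
    }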
@@ -1320,6 +1531,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor

     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
+        graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
         ggml_backend_view_init(dst->view_src->buffer, dst);
     }
     else {
@@ -1353,6 +1565,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_allocated = ggml_init(params);
     struct ggml_context * ctx_unallocated = ggml_init(params);

+    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+        fprintf(stderr, "failed to allocate context for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer = */ NULL,
+            /* .ctx_allocated = */ NULL,
+            /* .ctx_unallocated = */ NULL,
+            /* .graph = */ NULL,
+        };
+    }
+
     // dup nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
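ggml_backend_graph_copy now returns a struct with every field set to NULL when either context cannot be created (and, in the next hunk, when the buffer allocation fails), so callers can test .buffer instead of failing later. A sketch of that check (backend and graph assumed valid):

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml-backend.h"

    // Sketch: detect a failed graph copy via the NULL buffer field.
    static bool try_copy_graph(ggml_backend_t backend, struct ggml_cgraph * graph) {
        struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend, graph);
        if (copy.buffer == NULL) {
            fprintf(stderr, "graph copy failed\n");
            return false;
        }
        ggml_backend_graph_copy_free(copy); // frees the buffer and both contexts
        return true;
    }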
@@ -1361,6 +1588,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s

     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+    if (buffer == NULL) {
+        fprintf(stderr, "failed to allocate buffer for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer = */ NULL,
+            /* .ctx_allocated = */ NULL,
+            /* .ctx_unallocated = */ NULL,
+            /* .graph = */ NULL,
+        };
+    }

     //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);

@@ -1397,8 +1638,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
     ggml_free(copy.ctx_unallocated);
 }

-
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
     struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+    if (copy.buffer == NULL) {
+        return false;
+    }
+
     struct ggml_cgraph * g1 = graph;
     struct ggml_cgraph * g2 = copy.graph;

@@ -1428,4 +1673,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
     }

     ggml_backend_graph_copy_free(copy);
+
+    return true;
 }
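With ggml_backend_compare_graph_backend now returning bool, a failed graph copy is distinguishable from a completed comparison. A sketch of a caller; the callback signature is assumed from ggml-backend.h's ggml_backend_eval_callback, and a real callback would compare t1 and t2 rather than always returning true:

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml-backend.h"

    // Sketch: per-node callback; returning true continues the evaluation.
    static bool check_node(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) node_index; (void) t1; (void) t2; (void) user_data;
        return true;
    }

    // Sketch: the bool result reports whether the comparison could run at all.
    static void compare_backends(ggml_backend_t b1, ggml_backend_t b2, struct ggml_cgraph * graph) {
        if (!ggml_backend_compare_graph_backend(b1, b2, graph, check_node, NULL)) {
            fprintf(stderr, "comparison skipped: graph copy to the second backend failed\n");
        }
    }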