llama_cpp 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-backend.c

@@ -15,7 +15,11 @@
 
 // backend buffer type
 
-ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name(buft);
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -23,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         return buft->iface.get_alloc_size(buft, tensor);
@@ -44,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
 
 // backend buffer
 
-ggml_backend_buffer_t ggml_backend_buffer_init(
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
         ggml_backend_buffer_type_t buft,
         struct ggml_backend_buffer_i iface,
         ggml_backend_buffer_context_t context,
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
         /* .buft    = */ buft,
         /* .context = */ context,
         /* .size    = */ size,
+        /* .usage   = */ GGML_BACKEND_BUFFER_USAGE_ANY
     };
 
     return buffer;
 }
 
+const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name(buffer);
+}
+
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
         return;
@@ -86,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
     return base;
 }
 
-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
 }
 
 size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
+    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
 
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
+    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 
 void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 }
 
 bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
 }
 
-ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
+void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    buffer->usage = usage;
+}
+
+ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
 
+void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
+    if (buffer->iface.reset) {
+        buffer->iface.reset(buffer);
+    }
+}
+
+bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
+    if (dst_buf->iface.cpy_tensor) {
+        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+    }
+    return false;
+}
+
 // backend
 
 const char * ggml_backend_name(ggml_backend_t backend) {
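Note: the hunks above add a usage flag, name/type getters, a reset hook, and a direct cpy_tensor path to backend buffers. As a rough caller-side sketch (not code from this diff; buf stands for any previously allocated ggml_backend_buffer_t), the new helpers compose as follows, and the WEIGHTS usage is what pass 1 of the scheduler further down keys on:

    #include <stdio.h>
    #include "ggml-backend.h"

    // sketch: tag a weights buffer and print its name and buffer type
    static void tag_weights(ggml_backend_buffer_t buf) {
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        fprintf(stderr, "buffer %s (type %s)\n",
                ggml_backend_buffer_name(buf),
                ggml_backend_buft_name(ggml_backend_buffer_get_type(buf)));
    }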
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    if (backend->iface.set_tensor_async == NULL) {
+        ggml_backend_tensor_set(tensor, data, offset, size);
+    } else {
+        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    }
 }
 
 void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    if (backend->iface.get_tensor_async == NULL) {
+        ggml_backend_tensor_get(tensor, data, offset, size);
+    } else {
+        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    }
 }
 
-void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
+    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
-void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
+    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
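Note: the async accessors above now fall back to the blocking ggml_backend_tensor_set/get when a backend leaves the async hooks NULL, so they are safe to call on any backend. A minimal sketch (backend, t, and host_data are placeholder names, not from this diff):

    #include "ggml-backend.h"

    // sketch: async upload; the blocking fallback is handled inside the library
    static void upload(ggml_backend_t backend, struct ggml_tensor * t, const void * host_data) {
        ggml_backend_tensor_set_async(backend, t, host_data, 0, ggml_nbytes(t));
        ggml_backend_synchronize(backend); // still required before depending on the write
    }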
@@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
 
 void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     backend->iface.graph_plan_compute(backend, plan);
-
-    // TODO: optional sync
-    ggml_backend_synchronize(backend);
 }
 
 bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    if (!backend->iface.graph_compute(backend, cgraph)) {
-        return false;
-    }
-
-    // TODO: optional sync
-    ggml_backend_synchronize(backend);
-    return true;
+    return backend->iface.graph_compute(backend, cgraph);
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
 }
 
 void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
-    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
-    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
-    // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
-
     if (src == dst) {
         return;
     }
 
-
-
-    if (dst->buffer
-
-    } else if (src
-
-
-
-#ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
-                        "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
-#endif
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
+    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
+        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+#endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
         ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }
 
+void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    if (src == dst) {
+        return;
+    }
+
+    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
+        if (backend->iface.cpy_tensor_async != NULL) {
+            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
+                return;
+            }
+        }
+    }
+
+    size_t nbytes = ggml_nbytes(src);
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
+    }
+    else {
+        ggml_backend_tensor_copy(src, dst);
+    }
+}
+
+
 // backend registry
 
 #define GGML_MAX_BACKENDS_REG 16
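Note: ggml_backend_tensor_copy_async above prefers a backend-provided cpy_tensor_async and otherwise degrades to the synchronous copy; sched_compute_splits further down uses it for split inputs. A hedged usage sketch with placeholder names backend, src, and dst:

    #include "ggml-backend.h"

    // sketch: cross-backend copy that tries the backend's async path first
    static void copy_tensor(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
        ggml_backend_tensor_copy_async(backend, src, dst); // falls back internally when unsupported
        ggml_backend_synchronize(backend);                 // wait before reading dst
    }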
@@ -271,9 +318,9 @@ struct ggml_backend_reg {
 static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
 static size_t ggml_backend_registry_count = 0;
 
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
 
-static void ggml_backend_registry_init(void) {
+GGML_CALL static void ggml_backend_registry_init(void) {
     static bool initialized = false;
 
     if (initialized) {
@@ -286,18 +333,18 @@ static void ggml_backend_registry_init(void) {
 
     // add forward decls here to avoid including the backend headers
 #ifdef GGML_USE_CUBLAS
-    extern void ggml_backend_cuda_reg_devices(void);
+    extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
 
 #ifdef GGML_USE_METAL
-    extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
-    extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+    extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+    extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
 }
 
-void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
 
     size_t id = ggml_backend_registry_count;
@@ -392,68 +439,80 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
 
 // backend CPU
 
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CPU";
+
+    GGML_UNUSED(buffer);
+}
+
+GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }
 
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
 
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
+GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     memset(buffer->context, value, buffer->size);
 }
 
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+    /* .get_name        = */ ggml_backend_cpu_buffer_name,
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required
     /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
-    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
     /* .clear           = */ ggml_backend_cpu_buffer_clear,
+    /* .reset           = */ NULL,
 };
 
 // for buffers from ptr, free is not called
 static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+    /* .get_name        = */ ggml_backend_cpu_buffer_name,
     /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor     = */ NULL, // no initialization required
     /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
-    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
     /* .clear           = */ ggml_backend_cpu_buffer_clear,
+    /* .reset           = */ NULL,
 };
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
 
@@ -462,27 +521,28 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
 
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cpu(backend);
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     GGML_UNUSED(buft);
 }
 
-ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
+            /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
@@ -501,11 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
 
 #include <hbwmalloc.h>
 
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buf);
+}
+
+GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     //void * ptr = hbw_malloc(size);
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
         return NULL;
     }
 
-    // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
     buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
 
     return buffer;
 }
 
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
         /* .iface = */ {
+            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
@@ -544,20 +617,20 @@ struct ggml_backend_cpu_context {
     size_t work_size;
 };
 
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
     return "CPU";
 
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
     free(cpu_ctx->work_data);
     free(cpu_ctx);
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_cpu_buffer_type();
 
     GGML_UNUSED(backend);
@@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu {
     struct ggml_cgraph cgraph;
 };
 
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -583,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     return cpu_plan;
 }
 
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     free(cpu_plan->cplan.work_data);
@@ -592,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -600,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }
 
-static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -617,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     return true;
 }
 
-static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_from_async   = */ NULL,
-    /* .cpy_tensor_to_async     = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
@@ -660,8 +732,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     return cpu_backend;
 }
 
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_cpu_name;
+GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_cpu_name;
 }
 
 void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
@@ -671,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
 
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
     return ggml_backend_cpu_init();
 
     GGML_UNUSED(params);
@@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
 
 // scheduler
 
-#define GGML_MAX_BACKENDS
+#define GGML_MAX_BACKENDS 16
 #define GGML_MAX_SPLITS 256
 #define GGML_MAX_SPLIT_INPUTS 16
@@ -695,21 +767,29 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // graph view of this split
     struct ggml_cgraph graph;
 };
 
 struct ggml_backend_sched {
+    bool is_reset; // true if the scheduler has been reset since the last graph split
+
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
     ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
 
     ggml_gallocr_t galloc;
 
+    // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    ggml_tallocr_t * node_talloc;
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS];
+    // hash values (arrays of [hash_set.size])
+    ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
+    // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
+
     struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
     int n_splits;
@@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
     return INT_MAX;
 }
 
-static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
         return NULL;
     }
+
+    // check if this is already allocate in a allocr buffer (from user manual allocations)
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
+            return sched->tallocs[i];
+        }
+    }
+
     // find highest prio backend that supports the buffer type
     for (int i = 0; i < sched->n_backends; i++) {
         if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
-            return sched->backends[i];
+            return sched->tallocs[i];
         }
     }
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
@@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
     if (allocr == NULL) {
         return NULL;
     }
-    // find highest prio backend that supports the buffer type
     for (int i = 0; i < sched->n_backends; i++) {
         if (sched->tallocs[i] == allocr) {
             return sched->backends[i];
@@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
 }
 
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
 #endif
 
 // returns the backend that should be used for the node based on the current locations
-static
-//
-// ie. kv cache updates
-// note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
+static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    // assign pre-allocated nodes to their backend
     // dst
-
-    if (
+    ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
+    if (cur_allocr != NULL) {
         SET_CAUSE(node, "1.dst");
-        return
+        return cur_allocr;
     }
-
     // view_src
-    if (node->view_src != NULL
-
-
+    if (node->view_src != NULL) {
+        cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
+        if (cur_allocr != NULL) {
+            SET_CAUSE(node, "1.vsrc");
+            return cur_allocr;
+        }
     }
-
-    // src
-    int cur_prio = INT_MAX;
-    size_t cur_size = 0;
-
+    // assign nodes that use weights to the backend of the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = node->src[i];
         if (src == NULL) {
             break;
         }
-
-
-
-
-
-            cur_prio = src_prio;
-            cur_size = src_size;
-            cur_backend = src_backend;
-            SET_CAUSE(node, "1.src%d", i);
-        }
+        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+            ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
+            // operations with weights are always run on the same backend as the weights
+            SET_CAUSE(node, "1.wgt%d", i);
+            return src_allocr;
        }
     }
-
+
+    return NULL;
 }
 
 static char * fmt_size(size_t size) {
@@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
         }
         ggml_tallocr_t node_allocr = node_allocr(node);
         ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
-        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%
+        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
-            fprintf(stderr, " %20.20s (%
+            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
         fprintf(stderr, "\n");
@@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
     return dup;
 }
 
+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-// TODO: merge passes
 static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    // reset
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
-    memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
-    memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+    // reset splits
     sched->n_splits = 0;
+    sched->is_reset = false;
 
     struct ggml_init_params params = {
         /* .mem_size = */ sizeof(sched->context_buffer),
@@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         /* .no_alloc = */ true
     };
 
-    if (sched->ctx != NULL) {
-        ggml_free(sched->ctx);
-    }
+    ggml_free(sched->ctx);
 
     sched->ctx = ggml_init(params);
+    if (sched->ctx == NULL) {
+        fprintf(stderr, "%s: failed to initialize context\n", __func__);
+        GGML_ASSERT(false);
+    }
 
-    // pass 1: assign backends to ops with allocated inputs
+    // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         if (node_allocr(leaf) != NULL) {
             // do not overwrite user assignments
            continue;
         }
-        ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
-        if (leaf_backend == NULL && leaf->view_src != NULL) {
-            leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
-        }
-        if (leaf_backend != NULL) {
-            node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
-        }
+        node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -926,50 +1003,120 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             // do not overwrite user assignments
             continue;
         }
-
-
-
+        node_allocr(node) = sched_allocr_from_cur(sched, node);
+        // src
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            if (node_allocr(src) == NULL) {
+                node_allocr(src) = sched_allocr_from_cur(sched, src);
+            }
         }
     }
-
+#ifdef DEBUG_PASS1
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
-    // pass 2:
-    //
-    //
-
-
-
-
-
-
-
-
-
-
+    // pass 2: expand current backend assignments
+    // assign the same backend to adjacent nodes
+    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
+    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+    // pass 2.1 expand gpu up
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_allocr = NULL;
+                } else {
+                    cur_allocr = node_allocr;
                 }
-
-
-
-
-
-
-
-
-
-
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.1");
+            }
+        }
+    }
+
+    // pass 2.2 expand gpu down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_allocr = NULL;
+                } else {
+                    cur_allocr = node_allocr;
                 }
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.2");
             }
+        }
+    }
+
+    // pass 2.3 expand rest up
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
             if (node_allocr != NULL) {
-
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.3");
             }
         }
     }
-    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 
-    // pass
+    // pass 2.4 expand rest down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.4");
+            }
+        }
+    }
+#ifdef DEBUG_PASS2
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
+
+    // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t
+        ggml_tallocr_t cur_allocr = node_allocr(node);
+        if (node->view_src != NULL && cur_allocr == NULL) {
+            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+            SET_CAUSE(node, "3.vsrc");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -977,81 +1124,107 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr == NULL) {
-
+                if (src->view_src != NULL) {
+                    // views are always on the same backend as the source
+                    node_allocr(src) = node_allocr(src->view_src);
+                    SET_CAUSE(src, "3.vsrc");
+                } else {
+                    node_allocr(src) = cur_allocr;
+                    SET_CAUSE(src, "3.cur");
+                }
             }
         }
     }
-
+#ifdef DEBUG_PASS3
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 4: split graph, find tensors that need to be copied
-
-
-
-
-
-
-
-
-
-    }
-    }
-    sched->splits[0].i_start = 0;
-    sched->splits[0].n_inputs = 0;
-    memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-    ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
-    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-
-        if (ggml_is_view_op(node->op)) {
-            continue;
+    {
+        int cur_split = 0;
+        // find the backend of the first split, skipping view ops
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (!ggml_is_view_op(node->op)) {
+                sched->splits[0].tallocr = node_allocr(node);
+                break;
+            }
         }
+        sched->splits[0].i_start = 0;
+        sched->splits[0].n_inputs = 0;
+        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
+        ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
+        size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
 
-        ggml_tallocr_t node_allocr = node_allocr(node);
+            ggml_tallocr_t node_allocr = node_allocr(node);
 
-        if (node_allocr != cur_allocr) {
-            sched->splits[cur_split].i_end = i;
-            cur_split++;
-            GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
-            sched->splits[cur_split].tallocr = node_allocr;
-            sched->splits[cur_split].i_start = i;
-            sched->splits[cur_split].n_inputs = 0;
-            memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
-            cur_allocr = node_allocr;
-            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
-        }
+            GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
 
-
-
-
-
-
+            if (node_allocr != cur_allocr) {
+                sched->splits[cur_split].i_end = i;
+                cur_split++;
+                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+                sched->splits[cur_split].tallocr = node_allocr;
+                sched->splits[cur_split].i_start = i;
+                sched->splits[cur_split].n_inputs = 0;
+                cur_allocr = node_allocr;
+                cur_backend_id = sched_allocr_prio(sched, cur_allocr);
             }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            // find inputs that are not on the same backend
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    break;
+                }
+                ggml_tallocr_t src_allocr = node_allocr(src);
+                GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
+                if (src_allocr != node_allocr) {
+                    // check if the input is already in the split
+                    bool found = false;
+                    for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
+                        if (sched->splits[cur_split].inputs[k] == src) {
+                            found = true;
+                            break;
+                        }
+                    }
+
+                    if (!found) {
+                        int n_inputs = sched->splits[cur_split].n_inputs++;
+                        //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
+                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        sched->splits[cur_split].inputs[n_inputs] = src;
+                    }
+
+                    // create a copy of the input in the split's backend
+                    size_t id = hash_id(src);
+                    if (sched->node_copies[id][cur_backend_id] == NULL) {
+                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                        sched->node_copies[id][cur_backend_id] = tensor_copy;
+                        node_allocr(tensor_copy) = cur_allocr;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+                    }
+                    node->src[j] = sched->node_copies[id][cur_backend_id];
                 }
-            node->src[j] = sched->node_copies[id][cur_backend_id];
             }
         }
+        sched->splits[cur_split].i_end = graph->n_nodes;
+        sched->n_splits = cur_split + 1;
     }
-
-
-
-    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
+#ifdef DEBUG_PASS4
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
-#
+#ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1059,6 +1232,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         if (node_allocr == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
+        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1070,8 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                 node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
                 j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
             }
+            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+            }
         }
     }
+    fflush(stderr);
 #endif
 
     // create copies of the graph for each split
@@ -1085,6 +1269,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            // add a dependency to the input source so that it is not freed before the copy is done
+            GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
             input_cpy->src[0] = input;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
@@ -1119,24 +1305,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][
-
-
-
-
-
-
-
-            }
-            if (input_cpy->buffer == NULL) {
-                fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
-                exit(1);
-            }
-            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
-            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
-            ggml_backend_tensor_copy(input, input_cpy);
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
+
+            GGML_ASSERT(input->buffer != NULL);
+            GGML_ASSERT(input_cpy->buffer != NULL);
+
+            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
+            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
         }
-        //
+        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
         int64_t copy_end_us = ggml_time_us();
         copy_us[split_backend_id] += copy_end_us - copy_start_us;
 
@@ -1148,7 +1326,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
         uint64_t compute_start_us = ggml_time_us();
         ggml_backend_graph_compute(split_backend, &split->graph);
-        //
+        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
        uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }
@@ -1168,26 +1346,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
     for (int i = 0; i < sched->n_backends; i++) {
         ggml_tallocr_reset(sched->tallocs[i]);
     }
+    // reset state for the next run
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+    memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
+    memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+
+    sched->is_reset = true;
 }
 
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
+ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+    GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
 
-    struct ggml_backend_sched * sched =
-
+    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+
+    // initialize hash table
+    sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+    sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
+    sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
 
     sched->n_backends = n_backends;
     for (int i = 0; i < n_backends; i++) {
         sched->backends[i] = backends[i];
+        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
     }
 
     sched->galloc = ggml_gallocr_new();
 
     // init measure allocs for each backend
     for (int i = 0; i < n_backends; i++) {
-        sched->tallocs[i] =
+        sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
     }
 
+    sched_reset(sched);
+
     return sched;
 }
 
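The reworked constructor above takes explicit per-backend buffer types and a graph size hint. A minimal calling sketch, assuming a CUDA-enabled build of this version with ggml-backend.h included; the device index and two-backend setup are illustrative only:

    ggml_backend_t backends[2];
    backends[0] = ggml_backend_cuda_init(0); // GPU first, i.e. higher priority
    backends[1] = ggml_backend_cpu_init();

    // passing NULL for bufts falls back to each backend's default buffer type,
    // as the sched_new body above shows; graph_size sizes the internal hash set
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);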
@@ -1199,6 +1392,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
         ggml_tallocr_free(sched->tallocs[i]);
     }
     ggml_gallocr_free(sched->galloc);
+    ggml_free(sched->ctx);
     free(sched->hash_set.keys);
     free(sched->node_talloc);
     free(sched->node_copies);
@@ -1206,12 +1400,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 }
 
 void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    //
-    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
-    sched->hash_set.size = hash_size;
-    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
-    sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
-    sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+    GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
 
     sched_split_graph(sched, measure_graph);
     sched_alloc_splits(sched);
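With the hash-table allocation moved into ggml_backend_sched_new, init_measure now only asserts that it has not run before. A sketch of the intended one-shot flow, where measure_graph stands for a worst-case graph built by the caller:

    ggml_backend_sched_init_measure(sched, measure_graph); // a second call trips the assert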
@@ -1220,28 +1409,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     for (int i = 0; i < sched->n_backends; i++) {
         size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
         ggml_tallocr_free(sched->tallocs[i]);
-        sched->tallocs[i] =
+        sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
     }
 
     sched_reset(sched);
 }
 
 void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT(sched->hash_set.size >= graph->
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+    if (!sched->is_reset) {
+        sched_reset(sched);
+    }
 
     sched_split_graph(sched, graph);
     sched_alloc_splits(sched);
     sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
+int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+    return sched->n_splits;
+}
+
 ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     return sched->tallocs[backend_index];
 }
 
 ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
 }
 
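A short sketch of how the new entry points above combine at a call site; sched and graph are assumed to come from the earlier sketches:

    ggml_backend_sched_graph_compute(sched, graph); // split, allocate, and run

    // new accessor: number of per-backend splits for the last graph
    int n_splits = ggml_backend_sched_get_n_splits(sched);
    fprintf(stderr, "computed graph in %d split(s)\n", n_splits);

    // explicit reset is now a separate call; graph_compute also resets lazily via is_reset
    ggml_backend_sched_reset(sched);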
@@ -1251,10 +1453,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     node_allocr(node) = sched->tallocs[backend_index];
 }
 
+ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    ggml_tallocr_t allocr = node_allocr(node);
+    if (allocr == NULL) {
+        return NULL;
+    }
+    return get_allocr_backend(sched, allocr);
+}
+
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    //GGML_ASSERT(tensor->data == NULL); // views of pre-
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
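The new getter is the inverse of ggml_backend_sched_set_node_backend; a sketch, with node standing for any tensor in the scheduled graph:

    ggml_backend_sched_set_node_backend(sched, node, backends[1]); // pin to the CPU backend
    ggml_backend_t assigned = ggml_backend_sched_get_node_backend(sched, node);
    if (assigned == NULL) {
        // the scheduler has not assigned this node to any backend yet
    }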
@@ -1320,6 +1531,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
 
     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
+        graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
         ggml_backend_view_init(dst->view_src->buffer, dst);
     }
     else {
@@ -1353,6 +1565,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_context * ctx_allocated = ggml_init(params);
     struct ggml_context * ctx_unallocated = ggml_init(params);
 
+    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+        fprintf(stderr, "failed to allocate context for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer           = */ NULL,
+            /* .ctx_allocated    = */ NULL,
+            /* .ctx_unallocated  = */ NULL,
+            /* .graph            = */ NULL,
+        };
+    }
+
     // dup nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1361,6 +1588,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
 
     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+    if (buffer == NULL) {
+        fprintf(stderr, "failed to allocate buffer for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer           = */ NULL,
+            /* .ctx_allocated    = */ NULL,
+            /* .ctx_unallocated  = */ NULL,
+            /* .graph            = */ NULL,
+        };
+    }
 
     //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
 
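Both early returns above report failure through a zeroed struct instead of crashing, so callers can test copy.buffer; a minimal sketch:

    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend, graph);
    if (copy.buffer == NULL) {
        fprintf(stderr, "graph copy failed\n");
    } else {
        // ... use copy.graph on the target backend ...
        ggml_backend_graph_copy_free(copy);
    }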
@@ -1397,8 +1638,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
     ggml_free(copy.ctx_unallocated);
 }
 
-void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
     struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+    if (copy.buffer == NULL) {
+        return false;
+    }
+
     struct ggml_cgraph * g1 = graph;
     struct ggml_cgraph * g2 = copy.graph;
 
@@ -1428,4 +1673,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
     }
 
     ggml_backend_graph_copy_free(copy);
+
+    return true;
 }
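Since the function now returns bool instead of void, callers can distinguish a comparison that ran from one that could not start; a sketch, assuming a ggml_backend_eval_callback that returns true to keep comparing nodes:

    if (!ggml_backend_compare_graph_backend(backend1, backend2, graph, callback, user_data)) {
        fprintf(stderr, "comparison aborted: graph copy failed\n");
    }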