llama_cpp 0.9.5 → 0.10.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/llama_cpp.cpp +123 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +8 -1
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1796 -413
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +998 -169
- data/ext/llama_cpp/src/ggml-metal.metal +2253 -274
- data/ext/llama_cpp/src/ggml-quants.c +2 -2
- data/ext/llama_cpp/src/ggml.c +634 -248
- data/ext/llama_cpp/src/ggml.h +81 -15
- data/ext/llama_cpp/src/llama.cpp +932 -352
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -2
- metadata +2 -2
@@ -9,14 +9,36 @@
 #include <stdlib.h>
 #include <string.h>

-#define UNUSED GGML_UNUSED

 #define MAX(a, b) ((a) > (b) ? (a) : (b))

+
+// backend buffer type
+
+ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    return buft->iface.alloc_buffer(buft, size);
+}
+
+size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_alignment(buft);
+}
+
+size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+    // get_alloc_size is optional, defaults to ggml_nbytes
+    if (buft->iface.get_alloc_size) {
+        return buft->iface.get_alloc_size(buft, tensor);
+    }
+    return ggml_nbytes(tensor);
+}
+
+bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return buft->iface.supports_backend(buft, backend);
+}
+
 // backend buffer

 ggml_backend_buffer_t ggml_backend_buffer_init(
-
+        ggml_backend_buffer_type_t     buft,
         struct ggml_backend_buffer_i   iface,
         ggml_backend_buffer_context_t  context,
         size_t                         size) {
@@ -26,7 +48,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(

     (*buffer) = (struct ggml_backend_buffer) {
         /* .interface = */ iface,
-        /* .
+        /* .buft      = */ buft,
         /* .context   = */ context,
         /* .size      = */ size,
     };
@@ -45,10 +67,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     free(buffer);
 }

-size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
-    return ggml_backend_get_alignment(buffer->backend);
-}
-
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
     return buffer->size;
 }
@@ -61,14 +79,6 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
     return base;
 }

-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    // get_alloc_size is optional, defaults to ggml_nbytes
-    if (buffer->iface.get_alloc_size) {
-        return buffer->iface.get_alloc_size(buffer, tensor);
-    }
-    return ggml_nbytes(tensor);
-}
-
 void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
@@ -76,19 +86,20 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
     }
 }

-
-
-    if (buffer->iface.free_tensor) {
-        buffer->iface.free_tensor(buffer, tensor);
-    }
+size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
 }

-
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
+}

-
-    return
+ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
+    return buffer->buft;
 }

+// backend
+
 const char * ggml_backend_name(ggml_backend_t backend) {
     if (backend == NULL) {
         return "NULL";
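The hunks above (from the bundled ggml-backend.c, judging by the function names) replace the old per-buffer backend pointer with a `ggml_backend_buffer_type_t` abstraction. A minimal sketch of how the new entry points compose, assuming only the bundled `ggml-backend.h` header; the buffer size is arbitrary and the CPU buffer type is the one registered later in this diff:

```c
#include "ggml-backend.h"
#include <stdio.h>

// Sketch only: allocate a buffer from a buffer type and query its properties.
int main(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // buffers are now created from a buffer type rather than directly from a backend
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16u * 1024 * 1024);

    printf("alignment:   %zu bytes\n", ggml_backend_buft_get_alignment(buft));
    printf("buffer size: %zu bytes\n", ggml_backend_buffer_get_size(buf));

    // the owning buffer type can be recovered from the buffer itself,
    // which is what the get_alignment/get_alloc_size wrappers above rely on
    ggml_backend_buffer_type_t owner = ggml_backend_buffer_type(buf);
    printf("same type:   %d\n", owner == buft);

    ggml_backend_buffer_free(buf);
    return 0;
}
```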
@@ -104,43 +115,53 @@ void ggml_backend_free(ggml_backend_t backend) {
     backend->iface.free(backend);
 }

+ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
+    return backend->iface.get_default_buffer_type(backend);
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
-    return
+    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
 }

 size_t ggml_backend_get_alignment(ggml_backend_t backend) {
-    return
+    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
 }

-void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-
+void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
+    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
 }

-void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-
+void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+
+    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
 }

 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_backend_t backend = ggml_get_backend(tensor);
-
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(
+    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

-
-    backend->iface.synchronize(backend);
+    tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
 }

 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_backend_t backend = ggml_get_backend(tensor);
-
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(
+    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

-
-    backend->iface.synchronize(backend);
+    tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
 }

 void ggml_backend_synchronize(ggml_backend_t backend) {
+    if (backend->iface.synchronize == NULL) {
+        return;
+    }
+
     backend->iface.synchronize(backend);
 }

@@ -154,10 +175,16 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla

 void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     backend->iface.graph_plan_compute(backend, plan);
+
+    // TODO: optional sync
+    ggml_backend_synchronize(backend);
 }

 void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     backend->iface.graph_compute(backend, cgraph);
+
+    // TODO: optional sync
+    ggml_backend_synchronize(backend);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -194,14 +221,15 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst

     // TODO: allow backends to support copy to/from same backend

-    if (
-
-    } else if (
-
+    if (dst->buffer->iface.cpy_tensor_from != NULL) {
+        dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
+    } else if (src->buffer->iface.cpy_tensor_to != NULL) {
+        src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
     } else {
         // shouldn't be hit when copying from/to CPU
         #ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to
+        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
+                        "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
         #endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
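After this change `ggml_backend_tensor_set`/`ggml_backend_tensor_get` dispatch through the tensor's own buffer interface instead of a backend back-pointer, and `ggml_backend_synchronize` tolerates backends that leave `synchronize` unset. A hedged round-trip sketch on the CPU backend, using `ggml_backend_tensor_alloc` from the utils section added at the end of this file; sizes and names are illustrative only:

```c
#include "ggml.h"
#include "ggml-backend.h"
#include <assert.h>
#include <string.h>

// Sketch only: write and read tensor data through the backend-agnostic helpers.
static void tensor_roundtrip(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // tensor data will live in a backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

    ggml_backend_t        backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buf     = ggml_backend_alloc_buffer(backend, ggml_nbytes(t) + 64);
    ggml_backend_tensor_alloc(buf, t, ggml_backend_buffer_get_base(buf));

    const float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float       dst[4] = {0};

    // both calls now go through tensor->buffer->iface instead of a backend pointer
    ggml_backend_tensor_set(t, src, 0, sizeof(src));
    ggml_backend_tensor_get(t, dst, 0, sizeof(dst));
    assert(memcmp(src, dst, sizeof(src)) == 0);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
}
```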
@@ -211,101 +239,259 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }

-// backend
+// backend registry

-
-
-
-
+#define GGML_MAX_BACKENDS_REG 16
+
+struct ggml_backend_reg {
+    char name[128];
+    ggml_backend_init_fn init_fn;
+    ggml_backend_buffer_type_t default_buffer_type;
+    void * user_data;
 };

-static
-
+static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
+static size_t ggml_backend_registry_count = 0;
+
+static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+
+static void ggml_backend_registry_init(void) {
+    static bool initialized = false;
+
+    if (initialized) {
+        return;
+    }
+
+    initialized = true;

-
+    ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
+
+    // add forward decls here to avoid including the backend headers
+#ifdef GGML_USE_CUBLAS
+    extern void ggml_backend_cuda_reg_devices(void);
+    ggml_backend_cuda_reg_devices();
+#endif
+
+#ifdef GGML_USE_METAL
+    extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+    extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+    ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
+#endif
 }

-
-
-
-
-
+void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
+
+    int id = ggml_backend_registry_count;
+
+    ggml_backend_registry[id] = (struct ggml_backend_reg) {
+        /* .name                = */ {0},
+        /* .fn                  = */ init_fn,
+        /* .default_buffer_type = */ default_buffer_type,
+        /* .user_data           = */ user_data,
+    };
+
+    snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
+
+#ifndef NDEBUG
+    fprintf(stderr, "%s: registered backend %s\n", __func__, name);
+#endif
+
+    ggml_backend_registry_count++;
+}
+
+size_t ggml_backend_reg_get_count(void) {
+    ggml_backend_registry_init();
+
+    return ggml_backend_registry_count;
+}
+
+size_t ggml_backend_reg_find_by_name(const char * name) {
+    ggml_backend_registry_init();
+
+    for (size_t i = 0; i < ggml_backend_registry_count; i++) {
+        // TODO: case insensitive in a portable way
+        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
+            return i;
+        }
+    }
+    return SIZE_MAX;
+}
+
+// init from backend:params string
+ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
+    ggml_backend_registry_init();
+
+    const char * params = strchr(backend_str, ':');
+    char backend_name[128];
+    if (params == NULL) {
+        strcpy(backend_name, backend_str);
+        params = "";
+    } else {
+        strncpy(backend_name, backend_str, params - backend_str);
+        backend_name[params - backend_str] = '\0';
+        params++;
+    }
+
+    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+    if (backend_i == SIZE_MAX) {
+        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
+        return NULL;
+    }
+
+    return ggml_backend_reg_init_backend(backend_i, params);
+}
+
+const char * ggml_backend_reg_get_name(size_t i) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_registry[i].name;
+}
+
+ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
+}
+
+ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_registry[i].default_buffer_type;
+}
+
+ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
 }

+// backend CPU
+
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }

 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
+
+    GGML_UNUSED(buffer);
 }

 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
-    /* .free_buffer
-    /* .get_base
-    /* .
-    /* .
-    /* .
+    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
+    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
 };

 // for buffers from ptr, free is not called
 static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
-    /* .free_buffer
-    /* .get_base
-    /* .
-    /* .
-    /* .
+    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
+    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
 };

 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

-static ggml_backend_buffer_t
+static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?

     GGML_ASSERT(data != NULL && "failed to allocate buffer");

-    return ggml_backend_buffer_init(
+    return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }

-static size_t
+static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;
-    UNUSED(backend);
-}

-
-
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_UNUSED(buft);
+}

-
+static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cpu(backend);

-
+    GGML_UNUSED(buft);
 }

-
-
-
-
-
+ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+        /* .iface = */ {
+            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+        },
+        /* .context = */ NULL,
+    };

-
+    return &ggml_backend_buffer_type_cpu;
 }

-
-
-
+struct ggml_backend_cpu_context {
+    int n_threads;
+    void * work_data;
+    size_t work_size;
+};

-static
-
+static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+    return "CPU";

-
+    GGML_UNUSED(backend);
 }

-static void
-
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    free(cpu_ctx->work_data);
+    free(cpu_ctx);
+    free(backend);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_cpu_buffer_type();

-
+    GGML_UNUSED(backend);
 }

 struct ggml_backend_plan_cpu {
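The registry added in this hunk lets callers enumerate backends and construct one from a `"name"` or `"name:params"` string without including backend-specific headers; the CPU backend is always registered, CUDA and Metal only when the corresponding build flags are set. A hedged sketch of that flow:

```c
#include "ggml-backend.h"
#include <stdio.h>

// Sketch only: list the registered backends and initialize one by name.
static ggml_backend_t init_backend_by_name(const char * spec) {
    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
        fprintf(stderr, "backend #%zu: %s\n", i, ggml_backend_reg_get_name(i));
    }
    // spec is e.g. "CPU"; anything after a ':' is passed to the backend as params
    ggml_backend_t backend = ggml_backend_reg_init_backend_from_str(spec);
    if (backend == NULL) {
        fprintf(stderr, "backend '%s' not found, falling back to CPU\n", spec);
        backend = ggml_backend_cpu_init();
    }
    return backend;
}
```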
@@ -334,7 +520,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
     free(cpu_plan->cplan.work_data);
     free(cpu_plan);

-
+    GGML_UNUSED(backend);
 }

 static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
@@ -342,7 +528,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac

     ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

-
+    GGML_UNUSED(backend);
 }

 static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
@@ -363,25 +549,25 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c

 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     return true;
-
-
+
+    GGML_UNUSED(backend);
+    GGML_UNUSED(op);
 }

 static struct ggml_backend_i cpu_backend_i = {
-    /* .get_name
-    /* .free
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .
-    /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .get_name                = */ ggml_backend_cpu_name,
+    /* .free                    = */ ggml_backend_cpu_free,
+    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
+    /* .supports_op             = */ ggml_backend_cpu_supports_op,
 };

 ggml_backend_t ggml_backend_cpu_init(void) {
@@ -411,10 +597,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }

-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(
-    return ggml_backend_buffer_init(
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
+}
+
+static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+    return ggml_backend_cpu_init();
+
+    GGML_UNUSED(params);
+    GGML_UNUSED(user_data);
 }

+
 // scheduler

 #define GGML_MAX_BACKENDS 4
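Stepping back to the `ggml_backend_cpu_buffer_from_ptr` change a few lines above: the wrapped memory is now tied to the CPU buffer type, and its interface table leaves `free_buffer` as NULL, so ownership of the pointer stays with the caller. A small hedged sketch:

```c
#include "ggml-backend.h"
#include <stdlib.h>

// Sketch only: expose caller-owned host memory as a backend buffer.
// The buffer will not free `data`; the caller must release it after
// ggml_backend_buffer_free(buf).
static ggml_backend_buffer_t wrap_host_block(void ** out_data, size_t size) {
    void * data = malloc(size);
    *out_data = data; // caller keeps ownership of the storage
    return ggml_backend_cpu_buffer_from_ptr(data, size);
}
```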
@@ -427,7 +621,7 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
     int n_inputs;
-    struct ggml_cgraph
+    struct ggml_cgraph graph;
 };

 struct ggml_backend_sched {
@@ -453,7 +647,7 @@ struct ggml_backend_sched {
 #else
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) +
+    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
 };

 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@@ -482,23 +676,57 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
     return INT_MAX;
 }

+static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+    if (buffer == NULL) {
+        return NULL;
+    }
+    // find highest prio backend that supports the buffer type
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+            return sched->backends[i];
+        }
+    }
+    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
+}
+
+static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
+    if (allocr == NULL) {
+        return NULL;
+    }
+    // find highest prio backend that supports the buffer type
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->tallocs[i] == allocr) {
+            return sched->backends[i];
+        }
+    }
+    GGML_UNREACHABLE();
+}
+
+#if 0
+static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
+#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
+#define GET_CAUSE(node) causes[hash_id(node)]
+#else
+#define SET_CAUSE(node, ...)
+#define GET_CAUSE(node) ""
+#endif
+
 // returns the backend that should be used for the node based on the current locations
-char causes[GGML_DEFAULT_GRAPH_SIZE*4 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
 static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
     // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
     // ie. kv cache updates
     // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
     // dst
-    ggml_backend_t cur_backend =
+    ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
     if (cur_backend != NULL) {
-
+        SET_CAUSE(node, "1.dst");
         return cur_backend;
     }

     // view_src
-    if (node->view_src != NULL &&
-
-        return
+    if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
+        SET_CAUSE(node, "1.vsrc");
+        return get_buffer_backend(sched, node->view_src->buffer);
     }

     // src
@@ -510,7 +738,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
         if (src == NULL) {
             break;
         }
-        ggml_backend_t src_backend =
+        ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
         if (src_backend != NULL) {
             int src_prio = sched_backend_prio(sched, src_backend);
             size_t src_size = ggml_nbytes(src);
@@ -518,7 +746,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
                 cur_prio = src_prio;
                 cur_size = src_size;
                 cur_backend = src_backend;
-
+                SET_CAUSE(node, "1.src%d", i);
             }
         }
     }
@@ -539,10 +767,12 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
     int cur_split = 0;
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
-            ggml_backend_t split_backend =
-            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
+            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+                sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
             fprintf(stderr, "\n");
             cur_split++;
@@ -552,16 +782,18 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
             continue;
         }
         ggml_tallocr_t node_allocr = node_allocr(node);
-        ggml_backend_t node_backend = node_allocr ?
-        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
+        ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
+            fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
-            ggml_backend_t src_backend = src_allocr ?
-            fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
+            ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+            fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
+                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
         fprintf(stderr, "\n");
     }
@@ -587,9 +819,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     sched->n_splits = 0;

     struct ggml_init_params params = {
-
-
-
+        /* .mem_size   = */ sizeof(sched->context_buffer),
+        /* .mem_buffer = */ sched->context_buffer,
+        /* .no_alloc   = */ true
     };

     if (sched->ctx != NULL) {
@@ -605,9 +837,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             // do not overwrite user assignments
             continue;
         }
-        ggml_backend_t leaf_backend =
+        ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
         if (leaf_backend == NULL && leaf->view_src != NULL) {
-            leaf_backend =
+            leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
         }
         if (leaf_backend != NULL) {
             node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
@@ -649,7 +881,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     cur_prio = src_prio;
                     cur_size = src_size;
                     node_allocr = src_allocr;
-
+                    SET_CAUSE(node, "2.src%d", j);
                 }
             }
         }
@@ -733,7 +965,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                     sched->node_copies[id][cur_backend_id] = tensor_copy;
                     node_allocr(tensor_copy) = cur_allocr;
-                    ggml_backend_t backend =
+                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
                     ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
                 }
                 node->src[j] = sched->node_copies[id][cur_backend_id];
@@ -761,8 +993,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
                 fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, node_allocr ? ggml_backend_name(
-                    j, src->name, src_allocr ? ggml_backend_name(
+                    node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+                    j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
             }
         }
     }
@@ -773,7 +1005,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
-        split->graph = ggml_graph_view(
+        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
@@ -806,31 +1038,29 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {

     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &splits[i];
-        ggml_backend_t split_backend =
+        ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
         int split_backend_id = sched_backend_prio(sched, split_backend);

         // copy the input tensors to the split backend
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
-            struct ggml_tensor *
-
-
-
+            struct ggml_tensor * input = split->inputs[j];
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
+            if (input->buffer == NULL) {
+                if (input->view_src == NULL) {
+                    fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
                     exit(1);
                 }
-
-
-                view->buffer = view->view_src->buffer;
-                view->data = (char *)view->view_src->data + view->view_offs;
-                ggml_backend_buffer_init_tensor(ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
+                // FIXME: may need to use the sched buffer instead
+                ggml_backend_view_init(input->view_src->buffer, input);
             }
             if (input_cpy->buffer == NULL) {
                 fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
                 exit(1);
             }
-            GGML_ASSERT(
-            GGML_ASSERT(input_cpy->buffer->backend == split_backend);
-            ggml_backend_tensor_copy(
+            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
+            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            ggml_backend_tensor_copy(input, input_cpy);
         }
         // ggml_backend_synchronize(split_backend);
         int64_t copy_end_us = ggml_time_us();
@@ -843,7 +1073,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 #endif

         uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, split->graph);
+        ggml_backend_graph_compute(split_backend, &split->graph);
         // ggml_backend_synchronize(split_backend);
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
@@ -872,8 +1102,6 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_bac
     struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
     memset(sched, 0, sizeof(struct ggml_backend_sched));

-    fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
-
     sched->n_backends = n_backends;
     for (int i = 0; i < n_backends; i++) {
         sched->backends[i] = backends[i];
@@ -948,3 +1176,182 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     node_allocr(node) = sched->tallocs[backend_index];
 }
+
+// utils
+void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->buffer == NULL);
+    GGML_ASSERT(tensor->data == NULL);
+    GGML_ASSERT(tensor->view_src != NULL);
+    GGML_ASSERT(tensor->view_src->buffer != NULL);
+    GGML_ASSERT(tensor->view_src->data != NULL);
+
+    tensor->buffer = buffer;
+    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
+    tensor->backend = tensor->view_src->backend;
+    ggml_backend_buffer_init_tensor(buffer, tensor);
+}
+
+void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
+    GGML_ASSERT(tensor->buffer == NULL);
+    GGML_ASSERT(tensor->data == NULL);
+    GGML_ASSERT(tensor->view_src == NULL);
+    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
+    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
+                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
+
+    tensor->buffer = buffer;
+    tensor->data = addr;
+    ggml_backend_buffer_init_tensor(buffer, tensor);
+}
+
+static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+    struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
+
+    GGML_ASSERT(src != NULL);
+    GGML_ASSERT(src->data && "graph must be allocated");
+
+    size_t id = ggml_hash_insert(hash_set, src);
+    if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
+        return node_copies[ggml_hash_find(hash_set, src)];
+    }
+
+    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
+    if (src->view_src != NULL) {
+        dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+        dst->view_offs = src->view_offs;
+    }
+    dst->op = src->op;
+    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
+    ggml_set_name(dst, src->name);
+
+    // copy src
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        struct ggml_tensor * s = src->src[i];
+        if (s == NULL) {
+            break;
+        }
+        dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+    }
+
+    node_copies[id] = dst;
+    return dst;
+}
+
+static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+    size_t id = ggml_hash_find(hash_set, src);
+    if (node_init[id]) {
+        return;
+    }
+    node_init[id] = true;
+
+    struct ggml_tensor * dst = node_copies[id];
+    if (dst->view_src != NULL) {
+        ggml_backend_view_init(dst->view_src->buffer, dst);
+    }
+    else {
+        ggml_backend_tensor_copy(src, dst);
+    }
+
+    // init src
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        struct ggml_tensor * s = src->src[i];
+        if (s == NULL) {
+            break;
+        }
+        graph_init_tensor(hash_set, node_copies, node_init, s);
+    }
+}
+
+struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
+    struct ggml_hash_set hash_set = {
+        /* .size = */ graph->visited_hash_table.size,
+        /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
+    };
+    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
+    bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
+
+    struct ggml_init_params params = {
+        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true
+    };
+
+    struct ggml_context * ctx_allocated = ggml_init(params);
+    struct ggml_context * ctx_unallocated = ggml_init(params);
+
+    // dup nodes
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+    }
+
+    // allocate nodes
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+
+    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
+
+    // copy data and init views
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        graph_init_tensor(hash_set, node_copies, node_init, node);
+    }
+
+    // build graph copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
+        graph_copy->nodes[i] = node_copy;
+    }
+    graph_copy->n_nodes = graph->n_nodes;
+
+    free(hash_set.keys);
+    free(node_copies);
+    free(node_init);
+
+    return (struct ggml_backend_graph_copy) {
+        /* .buffer          = */ buffer,
+        /* .ctx_allocated   = */ ctx_allocated,
+        /* .ctx_unallocated = */ ctx_unallocated,
+        /* .graph           = */ graph_copy,
+    };
+}
+
+void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
+    ggml_backend_buffer_free(copy.buffer);
+    ggml_free(copy.ctx_allocated);
+    ggml_free(copy.ctx_unallocated);
+}
+
+void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+    struct ggml_cgraph * g1 = graph;
+    struct ggml_cgraph * g2 = copy.graph;
+
+    assert(g1->n_nodes == g2->n_nodes);
+
+    for (int i = 0; i < g1->n_nodes; i++) {
+        //printf("eval %d/%d\n", i, g1->n_nodes);
+        struct ggml_tensor * t1 = g1->nodes[i];
+        struct ggml_tensor * t2 = g2->nodes[i];
+
+        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
+
+        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
+        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
+
+        ggml_backend_graph_compute(backend1, &g1v);
+        ggml_backend_graph_compute(backend2, &g2v);
+
+        if (ggml_is_view_op(t1->op)) {
+            continue;
+        }
+
+        // compare results, calculate rms etc
+        if (!callback(i, t1, t2, user_data)) {
+            break;
+        }
+    }
+
+    ggml_backend_graph_copy_free(copy);
+}
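The new `ggml_backend_compare_graph_backend` utility runs the same graph node by node on two backends and hands each pair of result tensors to a callback. A hedged sketch of a callback that only logs the nodes; a real check would read both tensors back with `ggml_backend_tensor_get` and compute an error metric such as an RMS difference:

```c
#include "ggml.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stdio.h>

// Sketch only: log each evaluated node pair; return false to stop the comparison early.
static bool log_node_pair(int i, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) user_data;
    fprintf(stderr, "node %4d: %-16s %s\n", i, ggml_op_name(t1->op), t2->name);
    return true;
}

// graph must already be allocated and populated on backend1;
// the copy for backend2 is created internally by ggml_backend_graph_copy.
static void compare_backends(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph) {
    ggml_backend_compare_graph_backend(backend1, backend2, graph, log_node_pair, NULL);
}
```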