cui-llama.rn 1.0.3 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +35 -39
  2. package/android/src/main/CMakeLists.txt +12 -2
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +29 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +33 -1
  5. package/android/src/main/jni.cpp +62 -8
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +5 -0
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +5 -0
  8. package/cpp/common.cpp +3237 -3231
  9. package/cpp/common.h +469 -468
  10. package/cpp/ggml-aarch64.c +2193 -2193
  11. package/cpp/ggml-aarch64.h +39 -39
  12. package/cpp/ggml-alloc.c +1036 -1042
  13. package/cpp/ggml-backend-impl.h +153 -153
  14. package/cpp/ggml-backend.c +2240 -2234
  15. package/cpp/ggml-backend.h +238 -238
  16. package/cpp/ggml-common.h +1833 -1829
  17. package/cpp/ggml-impl.h +755 -655
  18. package/cpp/ggml-metal.h +65 -65
  19. package/cpp/ggml-metal.m +3269 -3269
  20. package/cpp/ggml-quants.c +14872 -14860
  21. package/cpp/ggml-quants.h +132 -132
  22. package/cpp/ggml.c +22055 -22044
  23. package/cpp/ggml.h +2453 -2447
  24. package/cpp/llama-grammar.cpp +539 -0
  25. package/cpp/llama-grammar.h +39 -0
  26. package/cpp/llama-impl.h +26 -0
  27. package/cpp/llama-sampling.cpp +635 -0
  28. package/cpp/llama-sampling.h +56 -0
  29. package/cpp/llama-vocab.cpp +1721 -0
  30. package/cpp/llama-vocab.h +130 -0
  31. package/cpp/llama.cpp +19171 -21892
  32. package/cpp/llama.h +1240 -1217
  33. package/cpp/log.h +737 -737
  34. package/cpp/rn-llama.hpp +207 -29
  35. package/cpp/sampling.cpp +460 -460
  36. package/cpp/sgemm.cpp +1027 -1027
  37. package/cpp/sgemm.h +14 -14
  38. package/cpp/unicode.cpp +6 -0
  39. package/cpp/unicode.h +3 -0
  40. package/ios/RNLlama.mm +15 -6
  41. package/ios/RNLlamaContext.h +2 -8
  42. package/ios/RNLlamaContext.mm +41 -34
  43. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  44. package/lib/commonjs/chat.js +37 -0
  45. package/lib/commonjs/chat.js.map +1 -0
  46. package/lib/commonjs/index.js +14 -1
  47. package/lib/commonjs/index.js.map +1 -1
  48. package/lib/module/NativeRNLlama.js.map +1 -1
  49. package/lib/module/chat.js +31 -0
  50. package/lib/module/chat.js.map +1 -0
  51. package/lib/module/index.js +14 -1
  52. package/lib/module/index.js.map +1 -1
  53. package/lib/typescript/NativeRNLlama.d.ts +5 -1
  54. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  55. package/lib/typescript/chat.d.ts +10 -0
  56. package/lib/typescript/chat.d.ts.map +1 -0
  57. package/lib/typescript/index.d.ts +9 -2
  58. package/lib/typescript/index.d.ts.map +1 -1
  59. package/package.json +1 -1
  60. package/src/NativeRNLlama.ts +10 -1
  61. package/src/chat.ts +44 -0
  62. package/src/index.ts +31 -4
@@ -1,2234 +1,2240 @@
1
- #include "ggml-backend-impl.h"
2
- #include "ggml-alloc.h"
3
- #include "ggml-impl.h"
4
-
5
- #include <assert.h>
6
- #include <limits.h>
7
- #include <stdarg.h>
8
- #include <stdio.h>
9
- #include <stdlib.h>
10
- #include <string.h>
11
-
12
-
13
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
-
15
- // backend buffer type
16
-
17
- const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
18
- return buft->iface.get_name(buft);
19
- }
20
-
21
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
22
- return buft->iface.alloc_buffer(buft, size);
23
- }
24
-
25
- size_t lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type_t buft) {
26
- return buft->iface.get_alignment(buft);
27
- }
28
-
29
- size_t lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_type_t buft) {
30
- // get_max_size is optional, defaults to SIZE_MAX
31
- if (buft->iface.get_max_size) {
32
- return buft->iface.get_max_size(buft);
33
- }
34
- return SIZE_MAX;
35
- }
36
-
37
- LM_GGML_CALL size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
38
- // get_alloc_size is optional, defaults to lm_ggml_nbytes
39
- if (buft->iface.get_alloc_size) {
40
- size_t size = buft->iface.get_alloc_size(buft, tensor);
41
- assert(size >= lm_ggml_nbytes(tensor));
42
- return size;
43
- }
44
- return lm_ggml_nbytes(tensor);
45
- }
46
-
47
- bool lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_type_t buft) {
48
- if (buft->iface.is_host) {
49
- return buft->iface.is_host(buft);
50
- }
51
- return false;
52
- }
53
-
54
- // backend buffer
55
-
56
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
57
- lm_ggml_backend_buffer_type_t buft,
58
- struct lm_ggml_backend_buffer_i iface,
59
- lm_ggml_backend_buffer_context_t context,
60
- size_t size) {
61
- lm_ggml_backend_buffer_t buffer = malloc(sizeof(struct lm_ggml_backend_buffer));
62
-
63
- (*buffer) = (struct lm_ggml_backend_buffer) {
64
- /* .interface = */ iface,
65
- /* .buft = */ buft,
66
- /* .context = */ context,
67
- /* .size = */ size,
68
- /* .usage = */ LM_GGML_BACKEND_BUFFER_USAGE_ANY
69
- };
70
-
71
- return buffer;
72
- }
73
-
74
- const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
75
- return buffer->iface.get_name(buffer);
76
- }
77
-
78
- void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
79
- if (buffer == NULL) {
80
- return;
81
- }
82
-
83
- if (buffer->iface.free_buffer != NULL) {
84
- buffer->iface.free_buffer(buffer);
85
- }
86
- free(buffer);
87
- }
88
-
89
- size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
90
- return buffer->size;
91
- }
92
-
93
- void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
94
- void * base = buffer->iface.get_base(buffer);
95
-
96
- LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
97
-
98
- return base;
99
- }
100
-
101
- LM_GGML_CALL void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
102
- // init_tensor is optional
103
- if (buffer->iface.init_tensor) {
104
- buffer->iface.init_tensor(buffer, tensor);
105
- }
106
- }
107
-
108
- size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) {
109
- return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
110
- }
111
-
112
- size_t lm_ggml_backend_buffer_get_max_size(lm_ggml_backend_buffer_t buffer) {
113
- return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_get_type(buffer));
114
- }
115
-
116
- size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
117
- return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
118
- }
119
-
120
- void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
121
- buffer->iface.clear(buffer, value);
122
- }
123
-
124
- bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
125
- return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
126
- }
127
-
128
- void lm_ggml_backend_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
129
- buffer->usage = usage;
130
-
131
- // FIXME: add a generic callback to the buffer interface
132
- if (lm_ggml_backend_buffer_is_multi_buffer(buffer)) {
133
- lm_ggml_backend_multi_buffer_set_usage(buffer, usage);
134
- }
135
- }
136
-
137
- enum lm_ggml_backend_buffer_usage lm_ggml_backend_buffer_get_usage(lm_ggml_backend_buffer_t buffer) {
138
- return buffer->usage;
139
- }
140
-
141
- lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_get_type(lm_ggml_backend_buffer_t buffer) {
142
- return buffer->buft;
143
- }
144
-
145
- void lm_ggml_backend_buffer_reset(lm_ggml_backend_buffer_t buffer) {
146
- if (buffer->iface.reset) {
147
- buffer->iface.reset(buffer);
148
- }
149
- }
150
-
151
- bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
152
- lm_ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
- if (dst_buf->iface.cpy_tensor) {
154
- return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
155
- }
156
- return false;
157
- }
158
-
159
- // backend
160
-
161
- lm_ggml_guid_t lm_ggml_backend_guid(lm_ggml_backend_t backend) {
162
- if (backend == NULL) {
163
- return NULL;
164
- }
165
- return backend->guid;
166
- }
167
-
168
- const char * lm_ggml_backend_name(lm_ggml_backend_t backend) {
169
- if (backend == NULL) {
170
- return "NULL";
171
- }
172
- return backend->iface.get_name(backend);
173
- }
174
-
175
- void lm_ggml_backend_free(lm_ggml_backend_t backend) {
176
- if (backend == NULL) {
177
- return;
178
- }
179
-
180
- backend->iface.free(backend);
181
- }
182
-
183
- lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
184
- return backend->iface.get_default_buffer_type(backend);
185
- }
186
-
187
- lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
188
- return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_get_default_buffer_type(backend), size);
189
- }
190
-
191
- size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend) {
192
- return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_get_default_buffer_type(backend));
193
- }
194
-
195
- size_t lm_ggml_backend_get_max_size(lm_ggml_backend_t backend) {
196
- return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_get_default_buffer_type(backend));
197
- }
198
-
199
- void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
200
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
201
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
202
-
203
- if (backend->iface.set_tensor_async == NULL) {
204
- lm_ggml_backend_tensor_set(tensor, data, offset, size);
205
- } else {
206
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
207
- }
208
- }
209
-
210
- void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
211
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
212
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
213
-
214
- if (backend->iface.get_tensor_async == NULL) {
215
- lm_ggml_backend_tensor_get(tensor, data, offset, size);
216
- } else {
217
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
218
- }
219
- }
220
-
221
- LM_GGML_CALL void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
222
- lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
223
-
224
- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
225
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
226
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
227
-
228
- if (!size) {
229
- return;
230
- }
231
-
232
- buf->iface.set_tensor(buf, tensor, data, offset, size);
233
- }
234
-
235
- LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
236
- lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
237
-
238
- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
239
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
240
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
241
-
242
- if (!size) {
243
- return;
244
- }
245
-
246
- buf->iface.get_tensor(buf, tensor, data, offset, size);
247
- }
248
-
249
- void lm_ggml_backend_synchronize(lm_ggml_backend_t backend) {
250
- if (backend->iface.synchronize == NULL) {
251
- return;
252
- }
253
-
254
- backend->iface.synchronize(backend);
255
- }
256
-
257
- lm_ggml_backend_graph_plan_t lm_ggml_backend_graph_plan_create(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
258
- LM_GGML_ASSERT(backend->iface.graph_plan_create != NULL);
259
-
260
- return backend->iface.graph_plan_create(backend, cgraph);
261
- }
262
-
263
- void lm_ggml_backend_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
264
- LM_GGML_ASSERT(backend->iface.graph_plan_free != NULL);
265
-
266
- backend->iface.graph_plan_free(backend, plan);
267
- }
268
-
269
- enum lm_ggml_status lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
270
- LM_GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
271
-
272
- return backend->iface.graph_plan_compute(backend, plan);
273
- }
274
-
275
- enum lm_ggml_status lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
276
- enum lm_ggml_status err = lm_ggml_backend_graph_compute_async(backend, cgraph);
277
- lm_ggml_backend_synchronize(backend);
278
- return err;
279
- }
280
-
281
- enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
282
- return backend->iface.graph_compute(backend, cgraph);
283
- }
284
-
285
- bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
286
- return backend->iface.supports_op(backend, op);
287
- }
288
-
289
- bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
290
- return backend->iface.supports_buft(backend, buft);
291
- }
292
-
293
- bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
294
- if (backend->iface.offload_op != NULL) {
295
- return backend->iface.offload_op(backend, op);
296
- }
297
- return false;
298
- }
299
-
300
- // backend copy
301
-
302
- static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) {
303
- if (a->type != b->type) {
304
- return false;
305
- }
306
- for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
307
- if (a->ne[i] != b->ne[i]) {
308
- return false;
309
- }
310
- if (a->nb[i] != b->nb[i]) {
311
- return false;
312
- }
313
- }
314
- return true;
315
- }
316
-
317
- void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
318
- LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
319
-
320
- if (src == dst) {
321
- return;
322
- }
323
-
324
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
325
- lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src));
326
- } else if (lm_ggml_backend_buffer_is_host(dst->buffer)) {
327
- lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
328
- } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
329
- #ifndef NDEBUG
330
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
331
- #endif
332
- size_t nbytes = lm_ggml_nbytes(src);
333
- void * data = malloc(nbytes);
334
- lm_ggml_backend_tensor_get(src, data, 0, nbytes);
335
- lm_ggml_backend_tensor_set(dst, data, 0, nbytes);
336
- free(data);
337
- }
338
- }
339
-
340
- void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
341
- LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
342
-
343
- if (src == dst) {
344
- return;
345
- }
346
-
347
- if (backend_dst->iface.cpy_tensor_async != NULL) {
348
- if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
349
- return;
350
- }
351
- }
352
-
353
- // an async copy would normally happen after all the queued operations on both backends are completed
354
- // sync src, set_async dst
355
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
356
- lm_ggml_backend_synchronize(backend_src);
357
- lm_ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, lm_ggml_nbytes(src));
358
- } else {
359
- lm_ggml_backend_synchronize(backend_src);
360
- lm_ggml_backend_tensor_copy(src, dst);
361
- lm_ggml_backend_synchronize(backend_dst);
362
- }
363
- }
364
-
365
- // events
366
-
367
- lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_t backend) {
368
- if (backend->iface.event_new == NULL) {
369
- return NULL;
370
- }
371
- return backend->iface.event_new(backend);
372
- }
373
-
374
- void lm_ggml_backend_event_free(lm_ggml_backend_event_t event) {
375
- if (event == NULL) {
376
- return;
377
- }
378
- event->backend->iface.event_free(event);
379
- }
380
-
381
- void lm_ggml_backend_event_record(lm_ggml_backend_event_t event) {
382
- LM_GGML_ASSERT(event->backend->iface.event_record != NULL);
383
-
384
- event->backend->iface.event_record(event);
385
- }
386
-
387
- void lm_ggml_backend_event_synchronize(lm_ggml_backend_event_t event) {
388
- LM_GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
389
-
390
- event->backend->iface.event_synchronize(event);
391
- }
392
-
393
- void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event_t event) {
394
- LM_GGML_ASSERT(backend->iface.event_wait != NULL);
395
-
396
- backend->iface.event_wait(backend, event);
397
- }
398
-
399
- // backend registry
400
-
401
- #define LM_GGML_REG_MAX_BACKENDS 64
402
-
403
- struct lm_ggml_backend_reg {
404
- char name[128];
405
- lm_ggml_backend_init_fn init_fn;
406
- lm_ggml_backend_buffer_type_t default_buffer_type;
407
- void * user_data;
408
- };
409
-
410
- static struct lm_ggml_backend_reg lm_ggml_backend_registry[LM_GGML_REG_MAX_BACKENDS];
411
- static size_t lm_ggml_backend_registry_count = 0;
412
-
413
- LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data);
414
-
415
- LM_GGML_CALL static void lm_ggml_backend_registry_init(void) {
416
- static bool initialized = false;
417
-
418
- if (initialized) {
419
- return;
420
- }
421
-
422
- initialized = true;
423
-
424
- lm_ggml_backend_register("CPU", lm_ggml_backend_reg_cpu_init, lm_ggml_backend_cpu_buffer_type(), NULL);
425
-
426
- // add forward decls here to avoid including the backend headers
427
- #ifdef LM_GGML_USE_CUDA
428
- extern LM_GGML_CALL void lm_ggml_backend_cuda_reg_devices(void);
429
- lm_ggml_backend_cuda_reg_devices();
430
- #endif
431
-
432
- #ifdef LM_GGML_USE_SYCL
433
- extern void lm_ggml_backend_sycl_reg_devices(void);
434
- lm_ggml_backend_sycl_reg_devices();
435
- #endif
436
-
437
- #ifdef LM_GGML_USE_METAL
438
- extern LM_GGML_CALL lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data);
439
- extern LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);
440
- lm_ggml_backend_register("Metal", lm_ggml_backend_reg_metal_init, lm_ggml_backend_metal_buffer_type(), NULL);
441
- #endif
442
-
443
- #ifdef LM_GGML_USE_VULKAN
444
- extern LM_GGML_CALL int lm_ggml_backend_vk_reg_devices(void);
445
- lm_ggml_backend_vk_reg_devices();
446
- #endif
447
-
448
- #ifdef LM_GGML_USE_KOMPUTE
449
- extern LM_GGML_CALL void lm_ggml_backend_kompute_reg_devices(void);
450
- lm_ggml_backend_kompute_reg_devices();
451
- #endif
452
-
453
- #ifdef LM_GGML_USE_CANN
454
- extern LM_GGML_CALL int lm_ggml_backend_cann_reg_devices(void);
455
- lm_ggml_backend_cann_reg_devices();
456
- #endif
457
- }
458
-
459
- LM_GGML_CALL void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
460
- LM_GGML_ASSERT(lm_ggml_backend_registry_count < LM_GGML_REG_MAX_BACKENDS);
461
-
462
- size_t id = lm_ggml_backend_registry_count;
463
-
464
- lm_ggml_backend_registry[id] = (struct lm_ggml_backend_reg) {
465
- /* .name = */ {0},
466
- /* .fn = */ init_fn,
467
- /* .default_buffer_type = */ default_buffer_type,
468
- /* .user_data = */ user_data,
469
- };
470
-
471
- snprintf(lm_ggml_backend_registry[id].name, sizeof(lm_ggml_backend_registry[id].name), "%s", name);
472
-
473
- #ifndef NDEBUG
474
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
475
- #endif
476
-
477
- lm_ggml_backend_registry_count++;
478
- }
479
-
480
- size_t lm_ggml_backend_reg_get_count(void) {
481
- lm_ggml_backend_registry_init();
482
-
483
- return lm_ggml_backend_registry_count;
484
- }
485
-
486
- size_t lm_ggml_backend_reg_find_by_name(const char * name) {
487
- lm_ggml_backend_registry_init();
488
-
489
- for (size_t i = 0; i < lm_ggml_backend_registry_count; i++) {
490
- // TODO: case insensitive in a portable way
491
- if (strcmp(lm_ggml_backend_registry[i].name, name) == 0) {
492
- return i;
493
- }
494
- }
495
-
496
- // not found
497
- return SIZE_MAX;
498
- }
499
-
500
- // init from backend:params string
501
- lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str) {
502
- lm_ggml_backend_registry_init();
503
-
504
- const char * params = strchr(backend_str, ':');
505
- char backend_name[128];
506
- if (params == NULL) {
507
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
508
- params = "";
509
- } else {
510
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
511
- params++;
512
- }
513
-
514
- size_t backend_i = lm_ggml_backend_reg_find_by_name(backend_name);
515
-
516
- if (backend_i == SIZE_MAX) {
517
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
518
- return NULL;
519
- }
520
-
521
- return lm_ggml_backend_reg_init_backend(backend_i, params);
522
- }
523
-
524
- const char * lm_ggml_backend_reg_get_name(size_t i) {
525
- lm_ggml_backend_registry_init();
526
-
527
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
528
- return lm_ggml_backend_registry[i].name;
529
- }
530
-
531
- lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params) {
532
- lm_ggml_backend_registry_init();
533
-
534
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
535
- return lm_ggml_backend_registry[i].init_fn(params, lm_ggml_backend_registry[i].user_data);
536
- }
537
-
538
- lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i) {
539
- lm_ggml_backend_registry_init();
540
-
541
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
542
- return lm_ggml_backend_registry[i].default_buffer_type;
543
- }
544
-
545
- lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
546
- lm_ggml_backend_registry_init();
547
-
548
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
549
- return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_registry[i].default_buffer_type, size);
550
- }
551
-
552
- // backend CPU
553
-
554
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
555
-
556
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_name(lm_ggml_backend_buffer_t buffer) {
557
- return "CPU";
558
-
559
- LM_GGML_UNUSED(buffer);
560
- }
561
-
562
- LM_GGML_CALL static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
563
- uintptr_t data = (uintptr_t)buffer->context;
564
-
565
- // align the buffer
566
- if (data % TENSOR_ALIGNMENT != 0) {
567
- data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
568
- }
569
-
570
- return (void *)data;
571
- }
572
-
573
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
574
- free(buffer->context);
575
- }
576
-
577
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
578
- memcpy((char *)tensor->data + offset, data, size);
579
-
580
- LM_GGML_UNUSED(buffer);
581
- }
582
-
583
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
584
- memcpy(data, (const char *)tensor->data + offset, size);
585
-
586
- LM_GGML_UNUSED(buffer);
587
- }
588
-
589
- LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
590
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
591
- memcpy(dst->data, src->data, lm_ggml_nbytes(src));
592
- return true;
593
- }
594
- return false;
595
-
596
- LM_GGML_UNUSED(buffer);
597
- }
598
-
599
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
600
- memset(buffer->context, value, buffer->size);
601
- }
602
-
603
- static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = {
604
- /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
605
- /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
606
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
607
- /* .init_tensor = */ NULL, // no initialization required
608
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
609
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
610
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
611
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
612
- /* .reset = */ NULL,
613
- };
614
-
615
- // for buffers from ptr, free is not called
616
- static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
617
- /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
618
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
619
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
620
- /* .init_tensor = */ NULL, // no initialization required
621
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
622
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
623
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
624
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
625
- /* .reset = */ NULL,
626
- };
627
-
628
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
629
- return "CPU";
630
-
631
- LM_GGML_UNUSED(buft);
632
- }
633
-
634
- LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
635
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
636
- void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h)
637
- if (data == NULL) {
638
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
639
- return NULL;
640
- }
641
-
642
- return lm_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
643
- }
644
-
645
- LM_GGML_CALL static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
646
- return TENSOR_ALIGNMENT;
647
-
648
- LM_GGML_UNUSED(buft);
649
- }
650
-
651
- LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
652
- return true;
653
-
654
- LM_GGML_UNUSED(buft);
655
- }
656
-
657
- LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
658
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
659
- /* .iface = */ {
660
- /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
661
- /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
662
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
663
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
664
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
665
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
666
- },
667
- /* .context = */ NULL,
668
- };
669
-
670
- return &lm_ggml_backend_cpu_buffer_type;
671
- }
672
-
673
- #ifdef LM_GGML_USE_CPU_HBM
674
-
675
- // buffer type HBM
676
-
677
- #include <hbwmalloc.h>
678
-
679
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
680
- return "CPU_HBM";
681
-
682
- LM_GGML_UNUSED(buft);
683
- }
684
-
685
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
686
- return "CPU_HBM";
687
-
688
- LM_GGML_UNUSED(buf);
689
- }
690
-
691
- LM_GGML_CALL static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
692
- hbw_free(buffer->context);
693
- }
694
-
695
- LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
696
- //void * ptr = hbw_malloc(size);
697
- void * ptr;
698
- int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
699
- if (result != 0) {
700
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
701
- return NULL;
702
- }
703
-
704
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
705
- buffer->buft = buft;
706
- buffer->iface.get_name = lm_ggml_backend_cpu_hbm_buffer_get_name;
707
- buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
708
-
709
- return buffer;
710
- }
711
-
712
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
713
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
714
- /* .iface = */ {
715
- /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
716
- /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
717
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
718
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
719
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
720
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
721
- },
722
- /* .context = */ NULL,
723
- };
724
-
725
- return &lm_ggml_backend_cpu_buffer_type_hbm;
726
- }
727
- #endif
728
-
729
- struct lm_ggml_backend_cpu_context {
730
- int n_threads;
731
- void * work_data;
732
- size_t work_size;
733
-
734
- lm_ggml_abort_callback abort_callback;
735
- void * abort_callback_data;
736
- };
737
-
738
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) {
739
- return "CPU";
740
-
741
- LM_GGML_UNUSED(backend);
742
- }
743
-
744
- LM_GGML_CALL static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
745
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
746
- free(cpu_ctx->work_data);
747
- free(cpu_ctx);
748
- free(backend);
749
- }
750
-
751
- LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
752
- return lm_ggml_backend_cpu_buffer_type();
753
-
754
- LM_GGML_UNUSED(backend);
755
- }
756
-
757
- struct lm_ggml_backend_plan_cpu {
758
- struct lm_ggml_cplan cplan;
759
- struct lm_ggml_cgraph cgraph;
760
- };
761
-
762
- LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
763
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
764
-
765
- struct lm_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct lm_ggml_backend_plan_cpu));
766
-
767
- cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
768
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
769
-
770
- if (cpu_plan->cplan.work_size > 0) {
771
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
772
- if (cpu_plan->cplan.work_data == NULL) {
773
- free(cpu_plan);
774
- return NULL;
775
- }
776
- }
777
-
778
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
779
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
780
-
781
- return cpu_plan;
782
- }
783
-
784
- LM_GGML_CALL static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
785
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
786
-
787
- free(cpu_plan->cplan.work_data);
788
- free(cpu_plan);
789
-
790
- LM_GGML_UNUSED(backend);
791
- }
792
-
793
- LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
794
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
795
-
796
- return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
797
-
798
- LM_GGML_UNUSED(backend);
799
- }
800
-
801
- LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
802
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
803
-
804
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
805
-
806
- if (cpu_ctx->work_size < cplan.work_size) {
807
- free(cpu_ctx->work_data);
808
- cpu_ctx->work_data = malloc(cplan.work_size);
809
- if (cpu_ctx->work_data == NULL) {
810
- cpu_ctx->work_size = 0;
811
- return LM_GGML_STATUS_ALLOC_FAILED;
812
- }
813
- cpu_ctx->work_size = cplan.work_size;
814
- }
815
- cplan.work_data = cpu_ctx->work_data;
816
-
817
- cplan.abort_callback = cpu_ctx->abort_callback;
818
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
819
-
820
- return lm_ggml_graph_compute(cgraph, &cplan);
821
- }
822
-
823
- LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
824
- switch (op->op) {
825
- case LM_GGML_OP_CPY:
826
- return
827
- op->type != LM_GGML_TYPE_IQ2_XXS &&
828
- op->type != LM_GGML_TYPE_IQ2_XS &&
829
- op->type != LM_GGML_TYPE_IQ1_S &&
830
- op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
831
- case LM_GGML_OP_MUL_MAT:
832
- return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
833
- default:
834
- return true;
835
- }
836
-
837
- LM_GGML_UNUSED(backend);
838
- }
839
-
840
- LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
841
- return lm_ggml_backend_buft_is_host(buft);
842
-
843
- LM_GGML_UNUSED(backend);
844
- }
845
-
846
- static struct lm_ggml_backend_i cpu_backend_i = {
847
- /* .get_name = */ lm_ggml_backend_cpu_name,
848
- /* .free = */ lm_ggml_backend_cpu_free,
849
- /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
850
- /* .set_tensor_async = */ NULL,
851
- /* .get_tensor_async = */ NULL,
852
- /* .cpy_tensor_async = */ NULL,
853
- /* .synchronize = */ NULL,
854
- /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
855
- /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
856
- /* .graph_plan_update = */ NULL,
857
- /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
858
- /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
859
- /* .supports_op = */ lm_ggml_backend_cpu_supports_op,
860
- /* .supports_buft = */ lm_ggml_backend_cpu_supports_buft,
861
- /* .offload_op = */ NULL,
862
- /* .event_new = */ NULL,
863
- /* .event_free = */ NULL,
864
- /* .event_record = */ NULL,
865
- /* .event_wait = */ NULL,
866
- /* .event_synchronize = */ NULL,
867
- };
868
-
869
- static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
870
- static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
871
- return &guid;
872
- }
873
-
874
- lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
875
- struct lm_ggml_backend_cpu_context * ctx = malloc(sizeof(struct lm_ggml_backend_cpu_context));
876
- if (ctx == NULL) {
877
- return NULL;
878
- }
879
-
880
- ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
881
- ctx->work_data = NULL;
882
- ctx->work_size = 0;
883
- ctx->abort_callback = NULL;
884
- ctx->abort_callback_data = NULL;
885
-
886
- lm_ggml_backend_t cpu_backend = malloc(sizeof(struct lm_ggml_backend));
887
- if (cpu_backend == NULL) {
888
- free(ctx);
889
- return NULL;
890
- }
891
-
892
- *cpu_backend = (struct lm_ggml_backend) {
893
- /* .guid = */ lm_ggml_backend_cpu_guid(),
894
- /* .interface = */ cpu_backend_i,
895
- /* .context = */ ctx
896
- };
897
- return cpu_backend;
898
- }
899
-
900
- LM_GGML_CALL bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
901
- return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
902
- }
903
-
904
- void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
905
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
906
-
907
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
908
- ctx->n_threads = n_threads;
909
- }
910
-
911
- void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
912
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
913
-
914
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
915
- ctx->abort_callback = abort_callback;
916
- ctx->abort_callback_data = abort_callback_data;
917
- }
918
-
919
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
920
- LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
921
- return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
922
- }
923
-
924
- LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data) {
925
- return lm_ggml_backend_cpu_init();
926
-
927
- LM_GGML_UNUSED(params);
928
- LM_GGML_UNUSED(user_data);
929
- }
930
-
931
- // multi-buffer buffer
932
-
933
- struct lm_ggml_backend_multi_buffer_context {
934
- lm_ggml_backend_buffer_t * buffers;
935
- size_t n_buffers;
936
- };
937
-
938
- typedef struct lm_ggml_backend_multi_buffer_context * lm_ggml_backend_multi_buffer_context_t;
939
-
940
- LM_GGML_CALL static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
941
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
942
-
943
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
944
- }
945
-
946
- LM_GGML_CALL static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
947
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
948
- for (size_t i = 0; i < ctx->n_buffers; i++) {
949
- lm_ggml_backend_buffer_free(ctx->buffers[i]);
950
- }
951
-
952
- free(ctx->buffers);
953
- free(ctx);
954
- }
955
-
956
- LM_GGML_CALL static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
957
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
958
- for (size_t i = 0; i < ctx->n_buffers; i++) {
959
- lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
960
- }
961
- }
962
-
963
- static struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_context_interface(void) {
964
- static struct lm_ggml_backend_buffer_i multi_backend_buffer_i = {
965
- /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
966
- /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
967
- /* .get_base = */ NULL,
968
- /* .init_tensor = */ NULL,
969
- /* .set_tensor = */ NULL,
970
- /* .get_tensor = */ NULL,
971
- /* .cpy_tensor = */ NULL,
972
- /* .clear = */ lm_ggml_backend_multi_buffer_clear,
973
- /* .reset = */ NULL,
974
- };
975
-
976
- return multi_backend_buffer_i;
977
- }
978
-
979
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
980
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
981
- ctx->n_buffers = n_buffers;
982
- ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
983
-
984
- LM_GGML_ASSERT(ctx->buffers != NULL);
985
-
986
- size_t total_size = 0;
987
- for (size_t i = 0; i < n_buffers; i++) {
988
- ctx->buffers[i] = buffers[i];
989
- total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
990
- }
991
-
992
- return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_context_interface(), ctx, total_size);
993
- }
994
-
995
- LM_GGML_CALL bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
996
- return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
997
- }
998
-
999
- LM_GGML_CALL void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
1000
- LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
1001
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
1002
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1003
- lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1004
- }
1005
- }
1006
-
1007
- // creates a copy of the tensor with the same memory layout
1008
- static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
1009
- struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
1010
- for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
1011
- dup->nb[i] = tensor->nb[i];
1012
- }
1013
- return dup;
1014
- }
1015
-
1016
- static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
1017
- return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
1018
- }
1019
-
1020
- // scheduler
1021
-
1022
- #ifndef LM_GGML_SCHED_MAX_BACKENDS
1023
- #define LM_GGML_SCHED_MAX_BACKENDS 16
1024
- #endif
1025
-
1026
- #ifndef LM_GGML_SCHED_MAX_SPLITS
1027
- #define LM_GGML_SCHED_MAX_SPLITS 2048
1028
- #endif
1029
-
1030
- #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
1031
- #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
1032
- #endif
1033
-
1034
- #ifndef LM_GGML_SCHED_MAX_COPIES
1035
- #define LM_GGML_SCHED_MAX_COPIES 4
1036
- #endif
1037
-
1038
- struct lm_ggml_backend_sched_split {
1039
- int backend_id;
1040
- int i_start;
1041
- int i_end;
1042
- struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1043
- int n_inputs;
1044
- // graph view of this split
1045
- struct lm_ggml_cgraph graph;
1046
- };
1047
-
1048
- struct lm_ggml_backend_sched {
1049
- bool is_reset; // true if the scheduler has been reset since the last graph split
1050
- bool is_alloc;
1051
-
1052
- int n_backends;
1053
-
1054
- lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
1055
- lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
1056
- lm_ggml_gallocr_t galloc;
1057
-
1058
- // hash keys of the nodes in the graph
1059
- struct lm_ggml_hash_set hash_set;
1060
- // hash values
1061
- int * tensor_backend_id;
1062
- struct lm_ggml_tensor * (* tensor_copies)[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
1063
-
1064
- int * node_backend_ids; // [graph_size]
1065
- int * leaf_backend_ids; // [graph_size]
1066
-
1067
- int * prev_node_backend_ids; // [graph_size]
1068
- int * prev_leaf_backend_ids; // [graph_size]
1069
-
1070
- // copy of the graph with modified inputs
1071
- struct lm_ggml_cgraph * graph;
1072
-
1073
- // graph splits
1074
- struct lm_ggml_backend_sched_split * splits;
1075
- int n_splits;
1076
- int splits_capacity;
1077
-
1078
- // pipeline parallelism support
1079
- int n_copies;
1080
- int cur_copy;
1081
- lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
1082
- struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1083
- int n_graph_inputs;
1084
-
1085
- struct lm_ggml_context * ctx;
1086
-
1087
- lm_ggml_backend_sched_eval_callback callback_eval;
1088
- void * callback_eval_user_data;
1089
-
1090
- bool debug;
1091
-
1092
- // align context_buffer to LM_GGML_MEM_ALIGN
1093
- #ifdef _MSC_VER
1094
- __declspec(align(LM_GGML_MEM_ALIGN))
1095
- #else
1096
- __attribute__((aligned(LM_GGML_MEM_ALIGN)))
1097
- #endif
1098
- char context_buffer[LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + sizeof(struct lm_ggml_cgraph)];
1099
- };
1100
-
1101
- #define hash_id(tensor) lm_ggml_hash_find_or_insert(sched->hash_set, tensor)
1102
- #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
1103
-
1104
- // returns the priority of the backend, lower id is higher priority
1105
- static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1106
- for (int i = 0; i < sched->n_backends; i++) {
1107
- if (sched->backends[i] == backend) {
1108
- return i;
1109
- }
1110
- }
1111
- return -1;
1112
- }
1113
-
1114
- static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
1115
- lm_ggml_backend_buffer_t buffer = tensor->buffer;
1116
- if (buffer == NULL) {
1117
- return -1;
1118
- }
1119
-
1120
- // find highest prio backend that supports the buffer type and the op
1121
- for (int i = 0; i < sched->n_backends; i++) {
1122
- if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1123
- lm_ggml_backend_supports_op(sched->backends[i], op)) {
1124
- return i;
1125
- }
1126
- }
1127
-
1128
- #ifndef NDEBUG
1129
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1130
- __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
1131
- #endif
1132
-
1133
- return -1;
1134
- }
1135
-
1136
- #if 0
1137
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1138
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1139
- #define GET_CAUSE(node) causes[hash_id(node)]
1140
- #else
1141
- #define SET_CAUSE(node, ...)
1142
- #define GET_CAUSE(node) ""
1143
- #endif
1144
-
1145
- // returns the backend that should be used for the node based on the current locations
1146
- static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
1147
- // TODO: use supports_op to check if the backend supports the op
1148
-
1149
- // assign pre-allocated nodes to their backend
1150
- int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1151
- if (cur_backend_id != -1) {
1152
- SET_CAUSE(tensor, "1.dst");
1153
- return cur_backend_id;
1154
- }
1155
-
1156
- // view_src
1157
- if (tensor->view_src != NULL) {
1158
- cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1159
- if (cur_backend_id != -1) {
1160
- SET_CAUSE(tensor, "1.vsrc");
1161
- return cur_backend_id;
1162
- }
1163
- }
1164
-
1165
- // graph input
1166
- if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1167
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1168
- SET_CAUSE(tensor, "1.inp");
1169
- return cur_backend_id;
1170
- }
1171
-
1172
- // assign nodes that use weights to the backend of the weights
1173
- // operations with weights are preferably run on the same backend as the weights
1174
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1175
- const struct lm_ggml_tensor * src = tensor->src[i];
1176
- if (src == NULL) {
1177
- continue;
1178
- }
1179
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1180
- int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1181
- // check if a backend with higher prio wants to offload the op
1182
- if (src_backend_id == sched->n_backends - 1) {
1183
- for (int b = 0; b < src_backend_id; b++) {
1184
- if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
1185
- SET_CAUSE(tensor, "1.off");
1186
- return b;
1187
- }
1188
- }
1189
- }
1190
- SET_CAUSE(tensor, "1.wgt%d", i);
1191
- return src_backend_id;
1192
- }
1193
- }
1194
-
1195
- return -1;
1196
- }
1197
-
1198
- static char * fmt_size(size_t size) {
1199
- static char buffer[128];
1200
- if (size >= 1024*1024) {
1201
- snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1202
- } else {
1203
- snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1204
- }
1205
- return buffer;
1206
- }
1207
-
1208
- static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1209
- int cur_split = 0;
1210
- for (int i = 0; i < graph->n_nodes; i++) {
1211
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1212
- lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1213
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
1214
- sched->splits[cur_split].n_inputs);
1215
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1216
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1217
- fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
1218
- }
1219
- fprintf(stderr, "\n");
1220
- cur_split++;
1221
- }
1222
- struct lm_ggml_tensor * node = graph->nodes[i];
1223
- if (lm_ggml_is_view_op(node->op)) {
1224
- continue;
1225
- }
1226
- lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1227
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1228
- fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1229
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1230
- struct lm_ggml_tensor * src = node->src[j];
1231
- if (src == NULL) {
1232
- continue;
1233
- }
1234
- lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1235
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1236
- fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1237
- }
1238
- fprintf(stderr, "\n");
1239
- }
1240
- }
1241
-
1242
- static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1243
- lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1244
- lm_ggml_backend_buffer_type_t buft = NULL;
1245
-
1246
- if (buf) {
1247
- // the tensor is already allocated
1248
- buft = buf->buft;
1249
- } else {
1250
- // see if the tensor already has a backend assigned, and use the buffer type of that backend
1251
- int tensor_backend_id = tensor_backend_id(t);
1252
- if (tensor_backend_id == -1 && t->view_src) {
1253
- tensor_backend_id = tensor_backend_id(t->view_src);
1254
- }
1255
- if (tensor_backend_id != -1) {
1256
- buft = sched->bufts[tensor_backend_id];
1257
- }
1258
- }
1259
-
1260
- return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
1261
- }
1262
-
1263
- static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1264
- if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1265
- *node_backend_id = cur_backend_id;
1266
- SET_CAUSE(node, "2.sup");
1267
- }
1268
- }
1269
-
1270
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1271
- static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1272
- // reset splits
1273
- sched->n_splits = 0;
1274
- sched->n_graph_inputs = 0;
1275
- sched->is_reset = false;
1276
-
1277
- struct lm_ggml_init_params params = {
1278
- /* .mem_size = */ sizeof(sched->context_buffer),
1279
- /* .mem_buffer = */ sched->context_buffer,
1280
- /* .no_alloc = */ true
1281
- };
1282
-
1283
- lm_ggml_free(sched->ctx);
1284
-
1285
- sched->ctx = lm_ggml_init(params);
1286
- if (sched->ctx == NULL) {
1287
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
1288
- LM_GGML_ASSERT(false);
1289
- }
1290
-
1291
- // pass 1: assign backends to ops with pre-allocated inputs
1292
- for (int i = 0; i < graph->n_leafs; i++) {
1293
- struct lm_ggml_tensor * leaf = graph->leafs[i];
1294
- int * leaf_backend_id = &tensor_backend_id(leaf);
1295
- if (*leaf_backend_id != -1) {
1296
- // do not overwrite user assignments
1297
- continue;
1298
- }
1299
- *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1300
- }
1301
-
1302
- for (int i = 0; i < graph->n_nodes; i++) {
1303
- struct lm_ggml_tensor * node = graph->nodes[i];
1304
- int * node_backend_id = &tensor_backend_id(node);
1305
- if (*node_backend_id != -1) {
1306
- // do not overwrite user assignments
1307
- continue;
1308
- }
1309
- *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1310
- // src
1311
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1312
- struct lm_ggml_tensor * src = node->src[j];
1313
- if (src == NULL) {
1314
- continue;
1315
- }
1316
- int * src_backend_id = &tensor_backend_id(src);
1317
- if (*src_backend_id == -1) {
1318
- *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1319
- }
1320
- }
1321
- }
1322
-
1323
- // pass 2: expand current backend assignments
1324
- // assign the same backend to adjacent nodes
1325
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1326
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1327
- // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1328
- // expand gpu down
1329
- {
1330
- int cur_backend_id = -1;
1331
- for (int i = 0; i < graph->n_nodes; i++) {
1332
- struct lm_ggml_tensor * node = graph->nodes[i];
1333
- if (lm_ggml_is_view_op(node->op)) {
1334
- continue;
1335
- }
1336
- int * node_backend_id = &tensor_backend_id(node);
1337
- if (*node_backend_id != -1) {
1338
- if (*node_backend_id == sched->n_backends - 1) {
1339
- // skip cpu (lowest prio backend)
1340
- cur_backend_id = -1;
1341
- } else {
1342
- cur_backend_id = *node_backend_id;
1343
- }
1344
- } else if (cur_backend_id != -1) {
1345
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1346
- }
1347
- }
1348
- }
1349
- // expand gpu up
1350
- {
1351
- int cur_backend_id = -1;
1352
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1353
- struct lm_ggml_tensor * node = graph->nodes[i];
1354
- if (lm_ggml_is_view_op(node->op)) {
1355
- continue;
1356
- }
1357
- int * node_backend_id = &tensor_backend_id(node);
1358
- if (*node_backend_id != -1) {
1359
- if (*node_backend_id == sched->n_backends - 1) {
1360
- // skip cpu (lowest prio backend)
1361
- cur_backend_id = -1;
1362
- } else {
1363
- cur_backend_id = *node_backend_id;
1364
- }
1365
- } else if (cur_backend_id != -1) {
1366
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1367
- }
1368
- }
1369
- }
1370
- // expand rest down
1371
- {
1372
- int cur_backend_id = -1;
1373
- for (int i = 0; i < graph->n_nodes; i++) {
1374
- struct lm_ggml_tensor * node = graph->nodes[i];
1375
- if (lm_ggml_is_view_op(node->op)) {
1376
- continue;
1377
- }
1378
- int * node_backend_id = &tensor_backend_id(node);
1379
- if (*node_backend_id != -1) {
1380
- cur_backend_id = *node_backend_id;
1381
- } else if (cur_backend_id != -1) {
1382
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1383
- }
1384
- }
1385
- }
1386
- // expand rest up
1387
- {
1388
- int cur_backend_id = -1;
1389
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1390
- struct lm_ggml_tensor * node = graph->nodes[i];
1391
- if (lm_ggml_is_view_op(node->op)) {
1392
- continue;
1393
- }
1394
- int * node_backend_id = &tensor_backend_id(node);
1395
- if (*node_backend_id != -1) {
1396
- cur_backend_id = *node_backend_id;
1397
- } else if (cur_backend_id != -1) {
1398
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1399
- }
1400
- }
1401
- }
1402
-
1403
- // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1404
- // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1405
- // however, we also need to verify that the sources are in compatible buffer types
1406
- // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1407
- // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1408
- // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1409
- // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1410
- // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1411
- for (int i = 0; i < graph->n_nodes; i++) {
1412
- struct lm_ggml_tensor * node = graph->nodes[i];
1413
- if (lm_ggml_is_view_op(node->op)) {
1414
- continue;
1415
- }
1416
- int * node_backend_id = &tensor_backend_id(node);
1417
- if (*node_backend_id == -1) {
1418
- // unassigned node: find the backend with the most supported inputs
1419
- int n_supported_best = -1;
1420
- for (int b = 0; b < sched->n_backends; b++) {
1421
- if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1422
- int n_supported = 0;
1423
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1424
- struct lm_ggml_tensor * src = node->src[j];
1425
- if (src == NULL) {
1426
- continue;
1427
- }
1428
- if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1429
- n_supported++;
1430
- }
1431
- }
1432
- if (n_supported > n_supported_best) {
1433
- n_supported_best = n_supported;
1434
- *node_backend_id = b;
1435
- SET_CAUSE(node, "3.best");
1436
- }
1437
- }
1438
- }
1439
- } else {
1440
- // assigned node: upgrade to higher prio backend if possible
1441
- for (int b = 0; b < *node_backend_id; b++) {
1442
- if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1443
- bool supported = true;
1444
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1445
- struct lm_ggml_tensor * src = node->src[j];
1446
- if (src == NULL) {
1447
- continue;
1448
- }
1449
- if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1450
- supported = false;
1451
- break;
1452
- }
1453
- }
1454
- if (supported) {
1455
- *node_backend_id = b;
1456
- SET_CAUSE(node, "3.upg");
1457
- break;
1458
- }
1459
- }
1460
- }
1461
- }
1462
- }
1463
-
1464
- // pass 4: assign backends to remaining src from dst and view_src
1465
- for (int i = 0; i < graph->n_nodes; i++) {
1466
- struct lm_ggml_tensor * node = graph->nodes[i];
1467
- int * cur_backend_id = &tensor_backend_id(node);
1468
- if (node->view_src != NULL && *cur_backend_id == -1) {
1469
- *cur_backend_id = tensor_backend_id(node->view_src);
1470
- SET_CAUSE(node, "4.vsrc");
1471
- }
1472
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1473
- struct lm_ggml_tensor * src = node->src[j];
1474
- if (src == NULL) {
1475
- continue;
1476
- }
1477
- int * src_backend_id = &tensor_backend_id(src);
1478
- if (*src_backend_id == -1) {
1479
- if (src->view_src != NULL) {
1480
- // views are always on the same backend as the source
1481
- *src_backend_id = tensor_backend_id(src->view_src);
1482
- SET_CAUSE(src, "4.vsrc");
1483
- } else {
1484
- *src_backend_id = *cur_backend_id;
1485
- SET_CAUSE(src, "4.cur");
1486
- }
1487
- }
1488
- }
1489
- }
1490
-
1491
- // pass 4: split graph, find tensors that need to be copied
1492
- {
1493
- int i_split = 0;
1494
- struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1495
- // find the backend of the first split, skipping view ops
1496
- for (int i = 0; i < graph->n_nodes; i++) {
1497
- struct lm_ggml_tensor * node = graph->nodes[i];
1498
- if (!lm_ggml_is_view_op(node->op)) {
1499
- split->backend_id = tensor_backend_id(node);
1500
- break;
1501
- }
1502
- }
1503
- split->i_start = 0;
1504
- split->n_inputs = 0;
1505
- memset(split->inputs, 0, sizeof(split->inputs)); //HACK
1506
- int cur_backend_id = split->backend_id;
1507
- for (int i = 0; i < graph->n_nodes; i++) {
1508
- struct lm_ggml_tensor * node = graph->nodes[i];
1509
-
1510
- if (lm_ggml_is_view_op(node->op)) {
1511
- continue;
1512
- }
1513
-
1514
- const int node_backend_id = tensor_backend_id(node);
1515
-
1516
- LM_GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
1517
-
1518
- // check if we should start a new split based on the sources of the current node
1519
- bool need_new_split = false;
1520
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1521
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1522
- struct lm_ggml_tensor * src = node->src[j];
1523
- if (src == NULL) {
1524
- continue;
1525
- }
1526
- // check if a weight is on a different backend
1527
- // by starting a new split, the memory of the previously offloaded weights can be reused
1528
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1529
- int src_backend_id = tensor_backend_id(src);
1530
- if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
1531
- need_new_split = true;
1532
- break;
1533
- }
1534
- }
1535
- // check if the split has too many inputs
1536
- // FIXME: count the number of inputs instead of only checking when full
1537
- if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1538
- const size_t id = hash_id(src);
1539
- int src_backend_id = sched->tensor_backend_id[id];
1540
- bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1541
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
1542
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1543
- need_new_split = true;
1544
- break;
1545
- }
1546
- }
1547
- }
1548
- }
1549
-
1550
- if (node_backend_id != cur_backend_id || need_new_split) {
1551
- split->i_end = i;
1552
- i_split++;
1553
- if (i_split >= sched->splits_capacity) {
1554
- sched->splits_capacity *= 2;
1555
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1556
- LM_GGML_ASSERT(sched->splits != NULL);
1557
- }
1558
- LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS);
1559
- split = &sched->splits[i_split];
1560
- split->backend_id = node_backend_id;
1561
- split->i_start = i;
1562
- split->n_inputs = 0;
1563
- cur_backend_id = node_backend_id;
1564
- }
1565
-
1566
- // find inputs that are not on the same backend
1567
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1568
- struct lm_ggml_tensor * src = node->src[j];
1569
- if (src == NULL) {
1570
- continue;
1571
- }
1572
-
1573
- const int src_backend_id = tensor_backend_id(src);
1574
- assert(src_backend_id != -1); // all inputs should be assigned by now
1575
-
1576
- if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1577
- size_t id = hash_id(src);
1578
- if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1579
- lm_ggml_backend_t backend = sched->backends[src_backend_id];
1580
- for (int c = 0; c < sched->n_copies; c++) {
1581
- struct lm_ggml_tensor * tensor_copy;
1582
- if (c == sched->cur_copy) {
1583
- tensor_copy = src; // use the original tensor as the current copy
1584
- } else {
1585
- tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1586
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1587
- }
1588
- if (sched->n_copies > 1) {
1589
- lm_ggml_set_input(tensor_copy);
1590
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1591
- }
1592
- sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
1593
- SET_CAUSE(tensor_copy, "4.cpy");
1594
- }
1595
- int n_graph_inputs = sched->n_graph_inputs++;
1596
- LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1597
- sched->graph_inputs[n_graph_inputs] = src;
1598
- }
1599
- }
1600
-
1601
- bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1602
- if (src_backend_id != cur_backend_id && !supported) {
1603
- // create a copy of the input in the split's backend
1604
- const size_t id = hash_id(src);
1605
- if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
1606
- lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1607
- for (int c = 0; c < sched->n_copies; c++) {
1608
- struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1609
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1610
- if (sched->n_copies > 1) {
1611
- lm_ggml_set_input(tensor_copy);
1612
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1613
- }
1614
- sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
1615
- SET_CAUSE(tensor_copy, "4.cpy");
1616
- }
1617
- int n_inputs = split->n_inputs++;
1618
- LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1619
- split->inputs[n_inputs] = src;
1620
- }
1621
- node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
1622
- }
1623
- }
1624
- }
1625
- split->i_end = graph->n_nodes;
1626
- sched->n_splits = i_split + 1;
1627
- }
1628
-
1629
- if (sched->debug) {
1630
- lm_ggml_backend_sched_print_assignments(sched, graph);
1631
- }
1632
-
1633
- // swap node_backend_ids and leaf_backend_ids and prevs
1634
- {
1635
- int * tmp = sched->node_backend_ids;
1636
- sched->node_backend_ids = sched->prev_node_backend_ids;
1637
- sched->prev_node_backend_ids = tmp;
1638
-
1639
- tmp = sched->leaf_backend_ids;
1640
- sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1641
- sched->prev_leaf_backend_ids = tmp;
1642
- }
1643
-
1644
- // create copies of the graph for each split
1645
- // TODO: avoid this copy
1646
- struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
1647
- for (int i = 0; i < sched->n_splits; i++) {
1648
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1649
- split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1650
-
1651
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1652
- for (int j = 0; j < split->n_inputs; j++) {
1653
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1654
-
1655
- struct lm_ggml_tensor * input = split->inputs[j];
1656
- const size_t input_id = hash_id(input);
1657
- struct lm_ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
1658
-
1659
- // add a dependency to the input source so that it is not freed before the copy is done
1660
- struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
1661
- input_dep->src[0] = input;
1662
- sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
1663
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1664
-
1665
- // add a dependency to the input copy so that it is allocated at the start of the split
1666
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1667
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1668
- }
1669
-
1670
- for (int j = split->i_start; j < split->i_end; j++) {
1671
- assert(graph_copy->size > graph_copy->n_nodes);
1672
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1673
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1674
- }
1675
- }
1676
-
1677
- if (sched->n_copies > 1) {
1678
- // add input copies as leafs so that they are allocated first
1679
- for (int i = 0; i < sched->n_graph_inputs; i++) {
1680
- struct lm_ggml_tensor * input = sched->graph_inputs[i];
1681
- size_t id = hash_id(input);
1682
- int backend_id = tensor_backend_id(input);
1683
- for (int c = 0; c < sched->n_copies; c++) {
1684
- struct lm_ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1685
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1686
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1687
- }
1688
- }
1689
-
1690
- for (int i = 0; i < sched->n_splits; i++) {
1691
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1692
- int backend_id = split->backend_id;
1693
- for (int j = 0; j < split->n_inputs; j++) {
1694
- struct lm_ggml_tensor * input = split->inputs[j];
1695
- size_t id = hash_id(input);
1696
- for (int c = 0; c < sched->n_copies; c++) {
1697
- struct lm_ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1698
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1699
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1700
- }
1701
- }
1702
- }
1703
- }
1704
-
1705
- // add leafs from the original graph
1706
- for (int i = 0; i < graph->n_leafs; i++) {
1707
- struct lm_ggml_tensor * leaf = graph->leafs[i];
1708
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1709
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1710
- }
1711
-
1712
- sched->graph = graph_copy;
1713
- }
1714
-
1715
- static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
1716
- bool backend_ids_changed = false;
1717
- for (int i = 0; i < sched->graph->n_nodes; i++) {
1718
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1719
- sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1720
- backend_ids_changed = true;
1721
- break;
1722
- }
1723
- }
1724
- if (!backend_ids_changed) {
1725
- for (int i = 0; i < sched->graph->n_leafs; i++) {
1726
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1727
- sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1728
- backend_ids_changed = true;
1729
- break;
1730
- }
1731
- }
1732
- }
1733
-
1734
- // allocate graph
1735
- if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1736
- // the re-allocation may cause the split inputs to be moved to a different address
1737
- lm_ggml_backend_sched_synchronize(sched);
1738
- #ifndef NDEBUG
1739
- fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
1740
- #endif
1741
- lm_ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1742
- if (!lm_ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1743
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1744
- return false;
1745
- }
1746
- }
1747
-
1748
- return true;
1749
- }
1750
-
1751
- static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
1752
- struct lm_ggml_backend_sched_split * splits = sched->splits;
1753
-
1754
- for (int i = 0; i < sched->n_splits; i++) {
1755
- struct lm_ggml_backend_sched_split * split = &splits[i];
1756
- int split_backend_id = split->backend_id;
1757
- lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
1758
-
1759
- // copy the input tensors to the split backend
1760
- for (int j = 0; j < split->n_inputs; j++) {
1761
- lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1762
- struct lm_ggml_tensor * input = split->inputs[j];
1763
- struct lm_ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
1764
-
1765
- if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1766
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1767
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1768
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1769
- } else {
1770
- lm_ggml_backend_synchronize(split_backend);
1771
- }
1772
- lm_ggml_backend_tensor_copy(input, input_cpy);
1773
- } else {
1774
- // wait for the split backend to finish using the input before overwriting it
1775
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1776
- lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1777
- } else {
1778
- lm_ggml_backend_synchronize(split_backend);
1779
- }
1780
- lm_ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
1781
- }
1782
- }
1783
-
1784
- if (!sched->callback_eval) {
1785
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
1786
- if (ec != LM_GGML_STATUS_SUCCESS) {
1787
- return ec;
1788
- }
1789
- } else {
1790
- // similar to lm_ggml_backend_compare_graph_backend
1791
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1792
- struct lm_ggml_tensor * t = split->graph.nodes[j0];
1793
-
1794
- // check if the user needs data from this node
1795
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1796
-
1797
- int j1 = j0;
1798
-
1799
- // determine the range [j0, j1] of nodes that can be computed together
1800
- while (!need && j1 < split->graph.n_nodes - 1) {
1801
- t = split->graph.nodes[++j1];
1802
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1803
- }
1804
-
1805
- struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
1806
-
1807
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
1808
- if (ec != LM_GGML_STATUS_SUCCESS) {
1809
- return ec;
1810
- }
1811
-
1812
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1813
- lm_ggml_backend_synchronize(split_backend);
1814
-
1815
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1816
- break;
1817
- }
1818
-
1819
- j0 = j1;
1820
- }
1821
- }
1822
-
1823
- // record the event of this copy
1824
- if (split->n_inputs > 0) {
1825
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1826
- lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
1827
- }
1828
- }
1829
- }
1830
-
1831
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1832
-
1833
- return LM_GGML_STATUS_SUCCESS;
1834
- }
1835
-
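The split compute loop above consults an optional eval callback twice per node: once with ask set to true, to decide where a batched range of nodes should end, and once with ask set to false after that range has been computed, where returning false stops the remaining nodes of the split. A minimal, hypothetical callback along those lines, registered through lm_ggml_backend_sched_set_eval_callback(); the matmul filter and the logging are illustrative assumptions, not taken from this diff:

```c
#include <stdio.h>
#include "ggml.h"         // assumed public headers
#include "ggml-backend.h"

// Hypothetical eval callback.
// ask == true : return true if this node's result should be observed (ends the batched range).
// ask == false: called after the range is computed; returning false aborts the rest of the split.
static bool debug_eval_cb(struct lm_ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return t->op == LM_GGML_OP_MUL_MAT; // assumption: only inspect matmul outputs
    }
    fprintf(stderr, "computed node %s\n", t->name);
    return true; // keep computing the rest of the split
}

// registration (sched created elsewhere):
// lm_ggml_backend_sched_set_eval_callback(sched, debug_eval_cb, NULL);
```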
1836
- lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1837
- lm_ggml_backend_t * backends,
1838
- lm_ggml_backend_buffer_type_t * bufts,
1839
- int n_backends,
1840
- size_t graph_size,
1841
- bool parallel) {
1842
- LM_GGML_ASSERT(n_backends > 0);
1843
- LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
1844
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1845
-
1846
- struct lm_ggml_backend_sched * sched = calloc(1, sizeof(struct lm_ggml_backend_sched));
1847
-
1848
- sched->debug = getenv("LM_GGML_SCHED_DEBUG") != NULL;
1849
-
1850
- // initialize hash table
1851
- sched->hash_set = lm_ggml_hash_set_new(graph_size);
1852
- sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
1853
- sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
1854
-
1855
- const size_t nodes_size = graph_size + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1856
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1857
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1858
- sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1859
- sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1860
-
1861
- sched->n_backends = n_backends;
1862
-
1863
- sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
1864
-
1865
- const int initial_splits_capacity = 16;
1866
- sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1867
- sched->splits_capacity = initial_splits_capacity;
1868
-
1869
- for (int b = 0; b < n_backends; b++) {
1870
- sched->backends[b] = backends[b];
1871
- sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
1872
- LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1873
- if (sched->n_copies > 1) {
1874
- for (int c = 0; c < sched->n_copies; c++) {
1875
- sched->events[b][c] = lm_ggml_backend_event_new(backends[b]);
1876
- }
1877
- }
1878
- }
1879
-
1880
- sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
1881
-
1882
- lm_ggml_backend_sched_reset(sched);
1883
-
1884
- return sched;
1885
- }
1886
-
1887
- void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
1888
- if (sched == NULL) {
1889
- return;
1890
- }
1891
- for (int b = 0; b < sched->n_backends; b++) {
1892
- for (int c = 0; c < sched->n_copies; c++) {
1893
- lm_ggml_backend_event_free(sched->events[b][c]);
1894
- }
1895
- }
1896
- lm_ggml_gallocr_free(sched->galloc);
1897
- lm_ggml_free(sched->ctx);
1898
- free(sched->splits);
1899
- free(sched->hash_set.keys);
1900
- free(sched->tensor_backend_id);
1901
- free(sched->tensor_copies);
1902
- free(sched->node_backend_ids);
1903
- free(sched->leaf_backend_ids);
1904
- free(sched->prev_node_backend_ids);
1905
- free(sched->prev_leaf_backend_ids);
1906
- free(sched);
1907
- }
1908
-
1909
- void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
1910
- // reset state for the next run
1911
- if (!sched->is_reset) {
1912
- size_t hash_size = sched->hash_set.size;
1913
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1914
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1915
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1916
-
1917
- sched->is_reset = true;
1918
- }
1919
- sched->is_alloc = false;
1920
- }
1921
-
1922
- bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
1923
- LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
1924
-
1925
- lm_ggml_backend_sched_split_graph(sched, measure_graph);
1926
-
1927
- // TODO: extract this to a separate function
1928
- if (!lm_ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1929
- return false;
1930
- }
1931
-
1932
- lm_ggml_backend_sched_reset(sched);
1933
- lm_ggml_backend_sched_synchronize(sched);
1934
-
1935
- return true;
1936
- }
1937
-
1938
- bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1939
- LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
1940
-
1941
- lm_ggml_backend_sched_split_graph(sched, graph);
1942
-
1943
- if (!lm_ggml_backend_sched_alloc_splits(sched)) {
1944
- return false;
1945
- }
1946
-
1947
- sched->is_alloc = true;
1948
-
1949
- return true;
1950
- }
1951
-
1952
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1953
- enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
1954
- lm_ggml_backend_sched_synchronize(sched);
1955
- return err;
1956
- }
1957
-
1958
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1959
- if (!sched->is_reset && !sched->is_alloc) {
1960
- lm_ggml_backend_sched_reset(sched);
1961
- }
1962
-
1963
- if (!sched->is_alloc) {
1964
- if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
1965
- return LM_GGML_STATUS_ALLOC_FAILED;
1966
- }
1967
- }
1968
-
1969
- return lm_ggml_backend_sched_compute_splits(sched);
1970
- }
1971
-
1972
- void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
1973
- for (int i = 0; i < sched->n_backends; i++) {
1974
- lm_ggml_backend_synchronize(sched->backends[i]);
1975
- }
1976
- }
1977
-
1978
- void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
1979
- sched->callback_eval = callback;
1980
- sched->callback_eval_user_data = user_data;
1981
- }
1982
-
1983
- int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
1984
- return sched->n_splits;
1985
- }
1986
-
1987
- int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
1988
- return sched->n_copies;
1989
- }
1990
-
1991
- int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
1992
- return sched->n_backends;
1993
- }
1994
-
1995
- lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
1996
- LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
1997
- return sched->backends[i];
1998
- }
1999
-
2000
- size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
2001
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2002
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2003
-
2004
- return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2005
- }
2006
-
2007
- void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
2008
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2009
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2010
- tensor_backend_id(node) = backend_index;
2011
- SET_CAUSE(node, "usr");
2012
- }
2013
-
2014
- lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
2015
- int backend_index = tensor_backend_id(node);
2016
- if (backend_index == -1) {
2017
- return NULL;
2018
- }
2019
- return sched->backends[backend_index];
2020
- }
2021
-
2022
- // utils
2023
-
2024
- void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
2025
- LM_GGML_ASSERT(tensor->buffer == NULL);
2026
- LM_GGML_ASSERT(tensor->view_src != NULL);
2027
- LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
2028
- LM_GGML_ASSERT(tensor->view_src->data != NULL);
2029
-
2030
- tensor->buffer = tensor->view_src->buffer;
2031
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2032
- lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2033
- }
2034
-
2035
- void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
2036
- LM_GGML_ASSERT(tensor->buffer == NULL);
2037
- LM_GGML_ASSERT(tensor->data == NULL);
2038
- LM_GGML_ASSERT(tensor->view_src == NULL);
2039
- LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
2040
- LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2041
- (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
2042
-
2043
- tensor->buffer = buffer;
2044
- tensor->data = addr;
2045
- lm_ggml_backend_buffer_init_tensor(buffer, tensor);
2046
- }
2047
-
2048
- static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
2049
- struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
2050
-
2051
- LM_GGML_ASSERT(src != NULL);
2052
- LM_GGML_ASSERT(src->data && "graph must be allocated");
2053
-
2054
- size_t id = lm_ggml_hash_insert(hash_set, src);
2055
- if (id == LM_GGML_HASHTABLE_ALREADY_EXISTS) {
2056
- return node_copies[lm_ggml_hash_find(hash_set, src)];
2057
- }
2058
-
2059
- struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2060
- if (src->view_src != NULL) {
2061
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2062
- dst->view_offs = src->view_offs;
2063
- }
2064
- dst->op = src->op;
2065
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2066
- lm_ggml_set_name(dst, src->name);
2067
-
2068
- // copy src
2069
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2070
- struct lm_ggml_tensor * s = src->src[i];
2071
- if (s == NULL) {
2072
- continue;
2073
- }
2074
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2075
- }
2076
-
2077
- node_copies[id] = dst;
2078
- return dst;
2079
- }
2080
-
2081
- static void graph_copy_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
2082
- size_t id = lm_ggml_hash_find(hash_set, src);
2083
- if (node_init[id]) {
2084
- return;
2085
- }
2086
- node_init[id] = true;
2087
-
2088
- struct lm_ggml_tensor * dst = node_copies[id];
2089
- if (dst->view_src != NULL) {
2090
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2091
- lm_ggml_backend_view_init(dst);
2092
- }
2093
- else {
2094
- lm_ggml_backend_tensor_copy(src, dst);
2095
- }
2096
-
2097
- // init src
2098
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2099
- struct lm_ggml_tensor * s = src->src[i];
2100
- if (s == NULL) {
2101
- continue;
2102
- }
2103
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2104
- }
2105
- }
2106
-
2107
- struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
2108
- struct lm_ggml_hash_set hash_set = {
2109
- /* .size = */ graph->visited_hash_table.size,
2110
- /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
2111
- };
2112
- struct lm_ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2113
- bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
2114
-
2115
- struct lm_ggml_init_params params = {
2116
- /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
2117
- /* .mem_buffer = */ NULL,
2118
- /* .no_alloc = */ true
2119
- };
2120
-
2121
- struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
2122
- struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
2123
-
2124
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2125
- fprintf(stderr, "failed to allocate context for graph copy\n");
2126
- free(hash_set.keys);
2127
- free(node_copies);
2128
- free(node_init);
2129
- lm_ggml_free(ctx_allocated);
2130
- lm_ggml_free(ctx_unallocated);
2131
- return (struct lm_ggml_backend_graph_copy) {
2132
- /* .buffer = */ NULL,
2133
- /* .ctx_allocated = */ NULL,
2134
- /* .ctx_unallocated = */ NULL,
2135
- /* .graph = */ NULL,
2136
- };
2137
- }
2138
-
2139
- // dup nodes
2140
- for (int i = 0; i < graph->n_nodes; i++) {
2141
- struct lm_ggml_tensor * node = graph->nodes[i];
2142
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2143
- }
2144
-
2145
- // allocate nodes
2146
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2147
- if (buffer == NULL) {
2148
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
2149
- free(hash_set.keys);
2150
- free(node_copies);
2151
- free(node_init);
2152
- lm_ggml_free(ctx_allocated);
2153
- lm_ggml_free(ctx_unallocated);
2154
- return (struct lm_ggml_backend_graph_copy) {
2155
- /* .buffer = */ NULL,
2156
- /* .ctx_allocated = */ NULL,
2157
- /* .ctx_unallocated = */ NULL,
2158
- /* .graph = */ NULL,
2159
- };
2160
- }
2161
-
2162
- //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2163
-
2164
- // copy data and init views
2165
- for (int i = 0; i < graph->n_nodes; i++) {
2166
- struct lm_ggml_tensor * node = graph->nodes[i];
2167
- graph_copy_init_tensor(hash_set, node_copies, node_init, node);
2168
- }
2169
-
2170
- // build graph copy
2171
- struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
2172
- for (int i = 0; i < graph->n_nodes; i++) {
2173
- struct lm_ggml_tensor * node = graph->nodes[i];
2174
- struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(hash_set, node)];
2175
- graph_copy->nodes[i] = node_copy;
2176
- }
2177
- graph_copy->n_nodes = graph->n_nodes;
2178
-
2179
- free(hash_set.keys);
2180
- free(node_copies);
2181
- free(node_init);
2182
-
2183
- return (struct lm_ggml_backend_graph_copy) {
2184
- /* .buffer = */ buffer,
2185
- /* .ctx_allocated = */ ctx_allocated,
2186
- /* .ctx_unallocated = */ ctx_unallocated,
2187
- /* .graph = */ graph_copy,
2188
- };
2189
- }
2190
-
2191
- void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
2192
- lm_ggml_backend_buffer_free(copy.buffer);
2193
- lm_ggml_free(copy.ctx_allocated);
2194
- lm_ggml_free(copy.ctx_unallocated);
2195
- }
2196
-
2197
- bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2198
- struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2199
- if (copy.buffer == NULL) {
2200
- return false;
2201
- }
2202
-
2203
- struct lm_ggml_cgraph * g1 = graph;
2204
- struct lm_ggml_cgraph * g2 = copy.graph;
2205
-
2206
- assert(g1->n_nodes == g2->n_nodes);
2207
-
2208
- for (int i = 0; i < g1->n_nodes; i++) {
2209
- //printf("eval %d/%d\n", i, g1->n_nodes);
2210
- struct lm_ggml_tensor * t1 = g1->nodes[i];
2211
- struct lm_ggml_tensor * t2 = g2->nodes[i];
2212
-
2213
- assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2214
-
2215
- struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2216
- struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2217
-
2218
- lm_ggml_backend_graph_compute(backend1, &g1v);
2219
- lm_ggml_backend_graph_compute(backend2, &g2v);
2220
-
2221
- if (lm_ggml_is_view_op(t1->op)) {
2222
- continue;
2223
- }
2224
-
2225
- // compare results, calculate rms etc
2226
- if (!callback(i, t1, t2, user_data)) {
2227
- break;
2228
- }
2229
- }
2230
-
2231
- lm_ggml_backend_graph_copy_free(copy);
2232
-
2233
- return true;
2234
- }
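For orientation, a minimal lifecycle sketch of the lm_ggml_backend_sched_* entry points implemented in the block above. lm_ggml_backend_cpu_init(), LM_GGML_DEFAULT_GRAPH_SIZE and the "ggml-backend.h" header are assumptions taken from the wider ggml API rather than from this diff, and the graph is assumed to be built by the caller; the snippet is illustrative only, not part of the package sources.

```c
#include "ggml.h"
#include "ggml-backend.h"

// Sketch: schedule one graph across a GPU backend plus the mandatory CPU backend.
static enum lm_ggml_status run_graph(lm_ggml_backend_t gpu, struct lm_ggml_cgraph * graph) {
    lm_ggml_backend_t backends[2] = { gpu, lm_ggml_backend_cpu_init() }; // CPU must be last (lowest priority)
    lm_ggml_backend_sched_t sched =
        lm_ggml_backend_sched_new(backends, NULL, 2, LM_GGML_DEFAULT_GRAPH_SIZE, /*parallel =*/ false);

    // optional: reserve worst-case buffers once, using a measure graph
    lm_ggml_backend_sched_reserve(sched, graph);

    // split the graph, allocate the splits and compute; ops the GPU cannot run fall back to the CPU backend
    enum lm_ggml_status status = lm_ggml_backend_sched_graph_compute(sched, graph);

    lm_ggml_backend_sched_free(sched);
    lm_ggml_backend_free(backends[1]); // the caller owns 'gpu'
    return status;
}
```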
1
+ #include "ggml-backend-impl.h"
2
+ #include "ggml-alloc.h"
3
+ #include "ggml-impl.h"
4
+
5
+ #include <assert.h>
6
+ #include <limits.h>
7
+ #include <stdarg.h>
8
+ #include <stdio.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+
12
+
13
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
+
15
+ // backend buffer type
16
+
17
+ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
18
+ return buft->iface.get_name(buft);
19
+ }
20
+
21
+ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
22
+ return buft->iface.alloc_buffer(buft, size);
23
+ }
24
+
25
+ size_t lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type_t buft) {
26
+ return buft->iface.get_alignment(buft);
27
+ }
28
+
29
+ size_t lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_type_t buft) {
30
+ // get_max_size is optional, defaults to SIZE_MAX
31
+ if (buft->iface.get_max_size) {
32
+ return buft->iface.get_max_size(buft);
33
+ }
34
+ return SIZE_MAX;
35
+ }
36
+
37
+ LM_GGML_CALL size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
38
+ // get_alloc_size is optional, defaults to lm_ggml_nbytes
39
+ if (buft->iface.get_alloc_size) {
40
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
41
+ assert(size >= lm_ggml_nbytes(tensor));
42
+ return size;
43
+ }
44
+ return lm_ggml_nbytes(tensor);
45
+ }
46
+
47
+ bool lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_type_t buft) {
48
+ if (buft->iface.is_host) {
49
+ return buft->iface.is_host(buft);
50
+ }
51
+ return false;
52
+ }
53
+
54
+ // backend buffer
55
+
56
+ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
57
+ lm_ggml_backend_buffer_type_t buft,
58
+ struct lm_ggml_backend_buffer_i iface,
59
+ lm_ggml_backend_buffer_context_t context,
60
+ size_t size) {
61
+ lm_ggml_backend_buffer_t buffer = malloc(sizeof(struct lm_ggml_backend_buffer));
62
+
63
+ (*buffer) = (struct lm_ggml_backend_buffer) {
64
+ /* .interface = */ iface,
65
+ /* .buft = */ buft,
66
+ /* .context = */ context,
67
+ /* .size = */ size,
68
+ /* .usage = */ LM_GGML_BACKEND_BUFFER_USAGE_ANY
69
+ };
70
+
71
+ return buffer;
72
+ }
73
+
74
+ const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
75
+ return buffer->iface.get_name(buffer);
76
+ }
77
+
78
+ void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
79
+ if (buffer == NULL) {
80
+ return;
81
+ }
82
+
83
+ if (buffer->iface.free_buffer != NULL) {
84
+ buffer->iface.free_buffer(buffer);
85
+ }
86
+ free(buffer);
87
+ }
88
+
89
+ size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
90
+ return buffer->size;
91
+ }
92
+
93
+ void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
94
+ void * base = buffer->iface.get_base(buffer);
95
+
96
+ LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
97
+
98
+ return base;
99
+ }
100
+
101
+ LM_GGML_CALL void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
102
+ // init_tensor is optional
103
+ if (buffer->iface.init_tensor) {
104
+ buffer->iface.init_tensor(buffer, tensor);
105
+ }
106
+ }
107
+
108
+ size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) {
109
+ return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
110
+ }
111
+
112
+ size_t lm_ggml_backend_buffer_get_max_size(lm_ggml_backend_buffer_t buffer) {
113
+ return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_get_type(buffer));
114
+ }
115
+
116
+ size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
117
+ return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
118
+ }
119
+
120
+ void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
121
+ buffer->iface.clear(buffer, value);
122
+ }
123
+
124
+ bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
125
+ return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
126
+ }
127
+
128
+ void lm_ggml_backend_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
129
+ buffer->usage = usage;
130
+
131
+ // FIXME: add a generic callback to the buffer interface
132
+ if (lm_ggml_backend_buffer_is_multi_buffer(buffer)) {
133
+ lm_ggml_backend_multi_buffer_set_usage(buffer, usage);
134
+ }
135
+ }
136
+
137
+ enum lm_ggml_backend_buffer_usage lm_ggml_backend_buffer_get_usage(lm_ggml_backend_buffer_t buffer) {
138
+ return buffer->usage;
139
+ }
140
+
141
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_get_type(lm_ggml_backend_buffer_t buffer) {
142
+ return buffer->buft;
143
+ }
144
+
145
+ void lm_ggml_backend_buffer_reset(lm_ggml_backend_buffer_t buffer) {
146
+ if (buffer->iface.reset) {
147
+ buffer->iface.reset(buffer);
148
+ }
149
+ }
150
+
151
+ bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
152
+ lm_ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
+ if (dst_buf->iface.cpy_tensor) {
154
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
155
+ }
156
+ return false;
157
+ }
158
+
159
+ // backend
160
+
161
+ lm_ggml_guid_t lm_ggml_backend_guid(lm_ggml_backend_t backend) {
162
+ if (backend == NULL) {
163
+ return NULL;
164
+ }
165
+ return backend->guid;
166
+ }
167
+
168
+ const char * lm_ggml_backend_name(lm_ggml_backend_t backend) {
169
+ if (backend == NULL) {
170
+ return "NULL";
171
+ }
172
+ return backend->iface.get_name(backend);
173
+ }
174
+
175
+ void lm_ggml_backend_free(lm_ggml_backend_t backend) {
176
+ if (backend == NULL) {
177
+ return;
178
+ }
179
+
180
+ backend->iface.free(backend);
181
+ }
182
+
183
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
184
+ return backend->iface.get_default_buffer_type(backend);
185
+ }
186
+
187
+ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
188
+ return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_get_default_buffer_type(backend), size);
189
+ }
190
+
191
+ size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend) {
192
+ return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_get_default_buffer_type(backend));
193
+ }
194
+
195
+ size_t lm_ggml_backend_get_max_size(lm_ggml_backend_t backend) {
196
+ return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_get_default_buffer_type(backend));
197
+ }
198
+
199
+ void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
200
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
201
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
202
+
203
+ if (backend->iface.set_tensor_async == NULL) {
204
+ lm_ggml_backend_tensor_set(tensor, data, offset, size);
205
+ } else {
206
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
207
+ }
208
+ }
209
+
210
+ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
211
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
212
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
213
+
214
+ if (backend->iface.get_tensor_async == NULL) {
215
+ lm_ggml_backend_tensor_get(tensor, data, offset, size);
216
+ } else {
217
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
218
+ }
219
+ }
220
+
221
+ LM_GGML_CALL void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
222
+ lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
223
+
224
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
225
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
226
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
227
+
228
+ if (!size) {
229
+ return;
230
+ }
231
+
232
+ buf->iface.set_tensor(buf, tensor, data, offset, size);
233
+ }
234
+
235
+ LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
236
+ lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
237
+
238
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
239
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
240
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
241
+
242
+ if (!size) {
243
+ return;
244
+ }
245
+
246
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
247
+ }
248
+
249
+ void lm_ggml_backend_synchronize(lm_ggml_backend_t backend) {
250
+ if (backend->iface.synchronize == NULL) {
251
+ return;
252
+ }
253
+
254
+ backend->iface.synchronize(backend);
255
+ }
256
+
257
+ lm_ggml_backend_graph_plan_t lm_ggml_backend_graph_plan_create(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
258
+ LM_GGML_ASSERT(backend->iface.graph_plan_create != NULL);
259
+
260
+ return backend->iface.graph_plan_create(backend, cgraph);
261
+ }
262
+
263
+ void lm_ggml_backend_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
264
+ LM_GGML_ASSERT(backend->iface.graph_plan_free != NULL);
265
+
266
+ backend->iface.graph_plan_free(backend, plan);
267
+ }
268
+
269
+ enum lm_ggml_status lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
270
+ LM_GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
271
+
272
+ return backend->iface.graph_plan_compute(backend, plan);
273
+ }
274
+
275
+ enum lm_ggml_status lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
276
+ enum lm_ggml_status err = lm_ggml_backend_graph_compute_async(backend, cgraph);
277
+ lm_ggml_backend_synchronize(backend);
278
+ return err;
279
+ }
280
+
281
+ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
282
+ return backend->iface.graph_compute(backend, cgraph);
283
+ }
284
+
285
+ bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
286
+ return backend->iface.supports_op(backend, op);
287
+ }
288
+
289
+ bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
290
+ return backend->iface.supports_buft(backend, buft);
291
+ }
292
+
293
+ bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
294
+ if (backend->iface.offload_op != NULL) {
295
+ return backend->iface.offload_op(backend, op);
296
+ }
297
+ return false;
298
+ }
299
+
300
+ // backend copy
301
+
302
+ static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) {
303
+ if (a->type != b->type) {
304
+ return false;
305
+ }
306
+ for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
307
+ if (a->ne[i] != b->ne[i]) {
308
+ return false;
309
+ }
310
+ if (a->nb[i] != b->nb[i]) {
311
+ return false;
312
+ }
313
+ }
314
+ return true;
315
+ }
316
+
317
+ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
318
+ LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
319
+
320
+ if (src == dst) {
321
+ return;
322
+ }
323
+
324
+ if (lm_ggml_backend_buffer_is_host(src->buffer)) {
325
+ lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src));
326
+ } else if (lm_ggml_backend_buffer_is_host(dst->buffer)) {
327
+ lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
328
+ } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
329
+ #ifndef NDEBUG
330
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
331
+ #endif
332
+ size_t nbytes = lm_ggml_nbytes(src);
333
+ void * data = malloc(nbytes);
334
+ lm_ggml_backend_tensor_get(src, data, 0, nbytes);
335
+ lm_ggml_backend_tensor_set(dst, data, 0, nbytes);
336
+ free(data);
337
+ }
338
+ }
339
+
340
+ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
341
+ LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
342
+
343
+ if (src == dst) {
344
+ return;
345
+ }
346
+
347
+ if (backend_dst->iface.cpy_tensor_async != NULL) {
348
+ if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
349
+ return;
350
+ }
351
+ }
352
+
353
+ // an async copy would normally happen after all the queued operations on both backends are completed
354
+ // sync src, set_async dst
355
+ if (lm_ggml_backend_buffer_is_host(src->buffer)) {
356
+ lm_ggml_backend_synchronize(backend_src);
357
+ lm_ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, lm_ggml_nbytes(src));
358
+ } else {
359
+ lm_ggml_backend_synchronize(backend_src);
360
+ lm_ggml_backend_tensor_copy(src, dst);
361
+ lm_ggml_backend_synchronize(backend_dst);
362
+ }
363
+ }
364
+
365
+ // events
366
+
367
+ lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_t backend) {
368
+ if (backend->iface.event_new == NULL) {
369
+ return NULL;
370
+ }
371
+ return backend->iface.event_new(backend);
372
+ }
373
+
374
+ void lm_ggml_backend_event_free(lm_ggml_backend_event_t event) {
375
+ if (event == NULL) {
376
+ return;
377
+ }
378
+ event->backend->iface.event_free(event);
379
+ }
380
+
381
+ void lm_ggml_backend_event_record(lm_ggml_backend_event_t event) {
382
+ LM_GGML_ASSERT(event->backend->iface.event_record != NULL);
383
+
384
+ event->backend->iface.event_record(event);
385
+ }
386
+
387
+ void lm_ggml_backend_event_synchronize(lm_ggml_backend_event_t event) {
388
+ LM_GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
389
+
390
+ event->backend->iface.event_synchronize(event);
391
+ }
392
+
393
+ void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event_t event) {
394
+ LM_GGML_ASSERT(backend->iface.event_wait != NULL);
395
+
396
+ backend->iface.event_wait(backend, event);
397
+ }
398
+
399
+ // backend registry
400
+
401
+ #define LM_GGML_REG_MAX_BACKENDS 64
402
+
403
+ struct lm_ggml_backend_reg {
404
+ char name[128];
405
+ lm_ggml_backend_init_fn init_fn;
406
+ lm_ggml_backend_buffer_type_t default_buffer_type;
407
+ void * user_data;
408
+ };
409
+
410
+ static struct lm_ggml_backend_reg lm_ggml_backend_registry[LM_GGML_REG_MAX_BACKENDS];
411
+ static size_t lm_ggml_backend_registry_count = 0;
412
+
413
+ LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data);
414
+
415
+ LM_GGML_CALL static void lm_ggml_backend_registry_init(void) {
416
+ static bool initialized = false;
417
+
418
+ if (initialized) {
419
+ return;
420
+ }
421
+
422
+ initialized = true;
423
+
424
+ lm_ggml_backend_register("CPU", lm_ggml_backend_reg_cpu_init, lm_ggml_backend_cpu_buffer_type(), NULL);
425
+
426
+ // add forward decls here to avoid including the backend headers
427
+ #ifdef LM_GGML_USE_CUDA
428
+ extern LM_GGML_CALL void lm_ggml_backend_cuda_reg_devices(void);
429
+ lm_ggml_backend_cuda_reg_devices();
430
+ #endif
431
+
432
+ #ifdef LM_GGML_USE_SYCL
433
+ extern void lm_ggml_backend_sycl_reg_devices(void);
434
+ lm_ggml_backend_sycl_reg_devices();
435
+ #endif
436
+
437
+ #ifdef LM_GGML_USE_METAL
438
+ extern LM_GGML_CALL lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data);
439
+ extern LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);
440
+ lm_ggml_backend_register("Metal", lm_ggml_backend_reg_metal_init, lm_ggml_backend_metal_buffer_type(), NULL);
441
+ #endif
442
+
443
+ #ifdef LM_GGML_USE_VULKAN
444
+ extern LM_GGML_CALL int lm_ggml_backend_vk_reg_devices(void);
445
+ lm_ggml_backend_vk_reg_devices();
446
+ #endif
447
+
448
+ #ifdef LM_GGML_USE_KOMPUTE
449
+ extern LM_GGML_CALL void lm_ggml_backend_kompute_reg_devices(void);
450
+ lm_ggml_backend_kompute_reg_devices();
451
+ #endif
452
+
453
+ #ifdef LM_GGML_USE_CANN
454
+ extern LM_GGML_CALL int lm_ggml_backend_cann_reg_devices(void);
455
+ lm_ggml_backend_cann_reg_devices();
456
+ #endif
457
+ }
458
+
459
+ LM_GGML_CALL void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
460
+ LM_GGML_ASSERT(lm_ggml_backend_registry_count < LM_GGML_REG_MAX_BACKENDS);
461
+
462
+ size_t id = lm_ggml_backend_registry_count;
463
+
464
+ lm_ggml_backend_registry[id] = (struct lm_ggml_backend_reg) {
465
+ /* .name = */ {0},
466
+ /* .fn = */ init_fn,
467
+ /* .default_buffer_type = */ default_buffer_type,
468
+ /* .user_data = */ user_data,
469
+ };
470
+
471
+ snprintf(lm_ggml_backend_registry[id].name, sizeof(lm_ggml_backend_registry[id].name), "%s", name);
472
+
473
+ #ifndef NDEBUG
474
+ fprintf(stderr, "%s: registered backend %s\n", __func__, name);
475
+ #endif
476
+
477
+ lm_ggml_backend_registry_count++;
478
+ }
479
+
480
+ size_t lm_ggml_backend_reg_get_count(void) {
481
+ lm_ggml_backend_registry_init();
482
+
483
+ return lm_ggml_backend_registry_count;
484
+ }
485
+
486
+ size_t lm_ggml_backend_reg_find_by_name(const char * name) {
487
+ lm_ggml_backend_registry_init();
488
+
489
+ for (size_t i = 0; i < lm_ggml_backend_registry_count; i++) {
490
+ // TODO: case insensitive in a portable way
491
+ if (strcmp(lm_ggml_backend_registry[i].name, name) == 0) {
492
+ return i;
493
+ }
494
+ }
495
+
496
+ // not found
497
+ return SIZE_MAX;
498
+ }
499
+
500
+ // init from backend:params string
501
+ lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str) {
502
+ lm_ggml_backend_registry_init();
503
+
504
+ const char * params = strchr(backend_str, ':');
505
+ char backend_name[128];
506
+ if (params == NULL) {
507
+ snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
508
+ params = "";
509
+ } else {
510
+ snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
511
+ params++;
512
+ }
513
+
514
+ size_t backend_i = lm_ggml_backend_reg_find_by_name(backend_name);
515
+
516
+ if (backend_i == SIZE_MAX) {
517
+ fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
518
+ return NULL;
519
+ }
520
+
521
+ return lm_ggml_backend_reg_init_backend(backend_i, params);
522
+ }
523
+
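lm_ggml_backend_reg_init_backend_from_str() above accepts either a plain backend name or a "backend:params" string and resolves it against the registry populated in lm_ggml_backend_registry_init(). A short, hypothetical use; the available names depend on which LM_GGML_USE_* flags were compiled in:

```c
#include "ggml-backend.h" // assumed public header

// Sketch: instantiate a registered backend by name; text after ':' is passed as init params.
static lm_ggml_backend_t init_backend_by_name(const char * name_and_params) {
    lm_ggml_backend_t backend = lm_ggml_backend_reg_init_backend_from_str(name_and_params);
    // NULL means the name was not found in the registry or initialization failed
    return backend;
}

// e.g. init_backend_by_name("CPU");
```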
524
+ const char * lm_ggml_backend_reg_get_name(size_t i) {
525
+ lm_ggml_backend_registry_init();
526
+
527
+ LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
528
+ return lm_ggml_backend_registry[i].name;
529
+ }
530
+
531
+ lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params) {
532
+ lm_ggml_backend_registry_init();
533
+
534
+ LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
535
+ return lm_ggml_backend_registry[i].init_fn(params, lm_ggml_backend_registry[i].user_data);
536
+ }
537
+
538
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i) {
539
+ lm_ggml_backend_registry_init();
540
+
541
+ LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
542
+ return lm_ggml_backend_registry[i].default_buffer_type;
543
+ }
544
+
545
+ lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
546
+ lm_ggml_backend_registry_init();
547
+
548
+ LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
549
+ return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_registry[i].default_buffer_type, size);
550
+ }
551
+
552
+ // backend CPU
553
+
554
+ static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
555
+
556
+ LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_name(lm_ggml_backend_buffer_t buffer) {
557
+ return "CPU";
558
+
559
+ LM_GGML_UNUSED(buffer);
560
+ }
561
+
562
+ LM_GGML_CALL static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
563
+ uintptr_t data = (uintptr_t)buffer->context;
564
+
565
+ // align the buffer
566
+ if (data % TENSOR_ALIGNMENT != 0) {
567
+ data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
568
+ }
569
+
570
+ return (void *)data;
571
+ }
572
+
573
+ LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
574
+ free(buffer->context);
575
+ }
576
+
577
+ LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
578
+ memcpy((char *)tensor->data + offset, data, size);
579
+
580
+ LM_GGML_UNUSED(buffer);
581
+ }
582
+
583
+ LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
584
+ memcpy(data, (const char *)tensor->data + offset, size);
585
+
586
+ LM_GGML_UNUSED(buffer);
587
+ }
588
+
589
+ LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
590
+ if (lm_ggml_backend_buffer_is_host(src->buffer)) {
591
+ memcpy(dst->data, src->data, lm_ggml_nbytes(src));
592
+ return true;
593
+ }
594
+ return false;
595
+
596
+ LM_GGML_UNUSED(buffer);
597
+ }
598
+
599
+ LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
600
+ memset(buffer->context, value, buffer->size);
601
+ }
602
+
603
+ static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = {
604
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
605
+ /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
606
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
607
+ /* .init_tensor = */ NULL, // no initialization required
608
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
609
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
610
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
611
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
612
+ /* .reset = */ NULL,
613
+ };
614
+
615
+ // for buffers from ptr, free is not called
616
+ static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
617
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
618
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
619
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
620
+ /* .init_tensor = */ NULL, // no initialization required
621
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
622
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
623
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
624
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
625
+ /* .reset = */ NULL,
626
+ };
627
+
628
+ LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
629
+ return "CPU";
630
+
631
+ LM_GGML_UNUSED(buft);
632
+ }
633
+
634
+ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
635
+ size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
636
+ void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h)
637
+ if (data == NULL) {
638
+ fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
639
+ return NULL;
640
+ }
641
+
642
+ return lm_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
643
+ }
644
+
645
+ LM_GGML_CALL static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
646
+ return TENSOR_ALIGNMENT;
647
+
648
+ LM_GGML_UNUSED(buft);
649
+ }
650
+
651
+ LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
652
+ return true;
653
+
654
+ LM_GGML_UNUSED(buft);
655
+ }
656
+
657
+ LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
658
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
659
+ /* .iface = */ {
660
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
661
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
662
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
663
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
664
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
665
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
666
+ },
667
+ /* .context = */ NULL,
668
+ };
669
+
670
+ return &lm_ggml_backend_cpu_buffer_type;
671
+ }
672
+
673
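The CPU buffer type above is the default host-memory allocator. A minimal sketch of allocating and releasing a buffer through it, using only calls that appear in this file:

    // Allocate 1 MiB of host memory via the CPU buffer type, then release it.
    static bool cpu_buffer_smoke_test(void) {
        lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();
        lm_ggml_backend_buffer_t      buf  = lm_ggml_backend_buft_alloc_buffer(buft, 1024 * 1024);
        if (buf == NULL) {
            return false;
        }
        // tensors could now be placed here, e.g. with lm_ggml_backend_tensor_alloc()
        lm_ggml_backend_buffer_free(buf);
        return true;
    }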
+ #ifdef LM_GGML_USE_CPU_HBM
674
+
675
+ // buffer type HBM
676
+
677
+ #include <hbwmalloc.h>
678
+
679
+ LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
680
+ return "CPU_HBM";
681
+
682
+ LM_GGML_UNUSED(buft);
683
+ }
684
+
685
+ LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
686
+ return "CPU_HBM";
687
+
688
+ LM_GGML_UNUSED(buf);
689
+ }
690
+
691
+ LM_GGML_CALL static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
692
+ hbw_free(buffer->context);
693
+ }
694
+
695
+ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
696
+ //void * ptr = hbw_malloc(size);
697
+ void * ptr;
698
+ int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
699
+ if (result != 0) {
700
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
701
+ return NULL;
702
+ }
703
+
704
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
705
+ buffer->buft = buft;
706
+ buffer->iface.get_name = lm_ggml_backend_cpu_hbm_buffer_get_name;
707
+ buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
708
+
709
+ return buffer;
710
+ }
711
+
712
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
713
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
714
+ /* .iface = */ {
715
+ /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
716
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
717
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
718
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
719
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
720
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
721
+ },
722
+ /* .context = */ NULL,
723
+ };
724
+
725
+ return &lm_ggml_backend_cpu_buffer_type_hbm;
726
+ }
727
+ #endif
728
+
729
+ struct lm_ggml_backend_cpu_context {
730
+ int n_threads;
731
+ void * work_data;
732
+ size_t work_size;
733
+
734
+ lm_ggml_abort_callback abort_callback;
735
+ void * abort_callback_data;
736
+ };
737
+
738
+ LM_GGML_CALL static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) {
739
+ return "CPU";
740
+
741
+ LM_GGML_UNUSED(backend);
742
+ }
743
+
744
+ LM_GGML_CALL static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
745
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
746
+ free(cpu_ctx->work_data);
747
+ free(cpu_ctx);
748
+ free(backend);
749
+ }
750
+
751
+ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
752
+ return lm_ggml_backend_cpu_buffer_type();
753
+
754
+ LM_GGML_UNUSED(backend);
755
+ }
756
+
757
+ struct lm_ggml_backend_plan_cpu {
758
+ struct lm_ggml_cplan cplan;
759
+ struct lm_ggml_cgraph cgraph;
760
+ };
761
+
762
+ LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
763
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
764
+
765
+ struct lm_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct lm_ggml_backend_plan_cpu));
766
+
767
+ cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
768
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
769
+
770
+ if (cpu_plan->cplan.work_size > 0) {
771
+ cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
772
+ if (cpu_plan->cplan.work_data == NULL) {
773
+ free(cpu_plan);
774
+ return NULL;
775
+ }
776
+ }
777
+
778
+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
779
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
780
+
781
+ return cpu_plan;
782
+ }
783
+
784
+ LM_GGML_CALL static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
785
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
786
+
787
+ free(cpu_plan->cplan.work_data);
788
+ free(cpu_plan);
789
+
790
+ LM_GGML_UNUSED(backend);
791
+ }
792
+
793
+ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
794
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
795
+
796
+ return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
797
+
798
+ LM_GGML_UNUSED(backend);
799
+ }
800
+
801
+ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
802
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
803
+
804
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
805
+
806
+ if (cpu_ctx->work_size < cplan.work_size) {
807
+ free(cpu_ctx->work_data);
808
+ cpu_ctx->work_data = malloc(cplan.work_size);
809
+ if (cpu_ctx->work_data == NULL) {
810
+ cpu_ctx->work_size = 0;
811
+ return LM_GGML_STATUS_ALLOC_FAILED;
812
+ }
813
+ cpu_ctx->work_size = cplan.work_size;
814
+ }
815
+ cplan.work_data = cpu_ctx->work_data;
816
+
817
+ cplan.abort_callback = cpu_ctx->abort_callback;
818
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
819
+
820
+ return lm_ggml_graph_compute(cgraph, &cplan);
821
+ }
822
+
823
+ LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
824
+ switch (op->op) {
825
+ case LM_GGML_OP_CPY:
826
+ return
827
+ op->type != LM_GGML_TYPE_IQ2_XXS &&
828
+ op->type != LM_GGML_TYPE_IQ2_XS &&
829
+ op->type != LM_GGML_TYPE_IQ1_S &&
830
+ op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
831
+ case LM_GGML_OP_MUL_MAT:
832
+ return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
833
+ default:
834
+ return true;
835
+ }
836
+
837
+ LM_GGML_UNUSED(backend);
838
+ }
839
+
840
+ LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
841
+ return lm_ggml_backend_buft_is_host(buft);
842
+
843
+ LM_GGML_UNUSED(backend);
844
+ }
845
+
846
+ static struct lm_ggml_backend_i cpu_backend_i = {
847
+ /* .get_name = */ lm_ggml_backend_cpu_name,
848
+ /* .free = */ lm_ggml_backend_cpu_free,
849
+ /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
850
+ /* .set_tensor_async = */ NULL,
851
+ /* .get_tensor_async = */ NULL,
852
+ /* .cpy_tensor_async = */ NULL,
853
+ /* .synchronize = */ NULL,
854
+ /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
855
+ /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
856
+ /* .graph_plan_update = */ NULL,
857
+ /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
858
+ /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
859
+ /* .supports_op = */ lm_ggml_backend_cpu_supports_op,
860
+ /* .supports_buft = */ lm_ggml_backend_cpu_supports_buft,
861
+ /* .offload_op = */ NULL,
862
+ /* .event_new = */ NULL,
863
+ /* .event_free = */ NULL,
864
+ /* .event_record = */ NULL,
865
+ /* .event_wait = */ NULL,
866
+ /* .event_synchronize = */ NULL,
867
+ };
868
+
869
+ static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
870
+ static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
871
+ return &guid;
872
+ }
873
+
874
+ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
875
+ struct lm_ggml_backend_cpu_context * ctx = malloc(sizeof(struct lm_ggml_backend_cpu_context));
876
+ if (ctx == NULL) {
877
+ return NULL;
878
+ }
879
+
880
+ ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
881
+ ctx->work_data = NULL;
882
+ ctx->work_size = 0;
883
+ ctx->abort_callback = NULL;
884
+ ctx->abort_callback_data = NULL;
885
+
886
+ lm_ggml_backend_t cpu_backend = malloc(sizeof(struct lm_ggml_backend));
887
+ if (cpu_backend == NULL) {
888
+ free(ctx);
889
+ return NULL;
890
+ }
891
+
892
+ *cpu_backend = (struct lm_ggml_backend) {
893
+ /* .guid = */ lm_ggml_backend_cpu_guid(),
894
+ /* .interface = */ cpu_backend_i,
895
+ /* .context = */ ctx
896
+ };
897
+ return cpu_backend;
898
+ }
899
+
900
+ LM_GGML_CALL bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
901
+ return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
902
+ }
903
+
904
+ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
905
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
906
+
907
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
908
+ ctx->n_threads = n_threads;
909
+ }
910
+
911
+ void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
912
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
913
+
914
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
915
+ ctx->abort_callback = abort_callback;
916
+ ctx->abort_callback_data = abort_callback_data;
917
+ }
918
+
919
+ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
920
+ LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
921
+ return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
922
+ }
923
+
924
+ LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data) {
925
+ return lm_ggml_backend_cpu_init();
926
+
927
+ LM_GGML_UNUSED(params);
928
+ LM_GGML_UNUSED(user_data);
929
+ }
930
+
931
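With the registry entry in place, the CPU backend can also be used directly. A hedged sketch, assuming lm_ggml_backend_graph_compute() and lm_ggml_backend_free() from the public ggml-backend API and an already built, allocated graph gf:

    // Hedged sketch: run an already built and allocated graph `gf` on the CPU backend.
    static enum lm_ggml_status run_on_cpu(struct lm_ggml_cgraph * gf, int n_threads) {
        lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init();
        if (cpu == NULL) {
            return LM_GGML_STATUS_ALLOC_FAILED;
        }
        lm_ggml_backend_cpu_set_n_threads(cpu, n_threads);              // overrides LM_GGML_DEFAULT_N_THREADS
        enum lm_ggml_status st = lm_ggml_backend_graph_compute(cpu, gf); // assumed public API
        lm_ggml_backend_free(cpu);                                       // assumed public API
        return st;
    }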
+ // multi-buffer buffer
932
+
933
+ struct lm_ggml_backend_multi_buffer_context {
934
+ lm_ggml_backend_buffer_t * buffers;
935
+ size_t n_buffers;
936
+ };
937
+
938
+ typedef struct lm_ggml_backend_multi_buffer_context * lm_ggml_backend_multi_buffer_context_t;
939
+
940
+ LM_GGML_CALL static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
941
+ lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
942
+
943
+ return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
944
+ }
945
+
946
+ LM_GGML_CALL static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
947
+ lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
948
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
949
+ lm_ggml_backend_buffer_free(ctx->buffers[i]);
950
+ }
951
+
952
+ free(ctx->buffers);
953
+ free(ctx);
954
+ }
955
+
956
+ LM_GGML_CALL static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
957
+ lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
958
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
959
+ lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
960
+ }
961
+ }
962
+
963
+ static struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_context_interface(void) {
964
+ static struct lm_ggml_backend_buffer_i multi_backend_buffer_i = {
965
+ /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
966
+ /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
967
+ /* .get_base = */ NULL,
968
+ /* .init_tensor = */ NULL,
969
+ /* .set_tensor = */ NULL,
970
+ /* .get_tensor = */ NULL,
971
+ /* .cpy_tensor = */ NULL,
972
+ /* .clear = */ lm_ggml_backend_multi_buffer_clear,
973
+ /* .reset = */ NULL,
974
+ };
975
+
976
+ return multi_backend_buffer_i;
977
+ }
978
+
979
+ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
980
+ lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
981
+ ctx->n_buffers = n_buffers;
982
+ ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
983
+
984
+ LM_GGML_ASSERT(ctx->buffers != NULL);
985
+
986
+ size_t total_size = 0;
987
+ for (size_t i = 0; i < n_buffers; i++) {
988
+ ctx->buffers[i] = buffers[i];
989
+ total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
990
+ }
991
+
992
+ return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_context_interface(), ctx, total_size);
993
+ }
994
+
995
+ LM_GGML_CALL bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
996
+ return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
997
+ }
998
+
999
+ LM_GGML_CALL void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
1000
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
1001
+ lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
1002
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
1003
+ lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1004
+ }
1005
+ }
1006
+
1007
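The multi-buffer has no get_base or set_tensor, so it only supports ownership-style operations (free, clear, set_usage) that fan out to its parts. A hedged sketch wrapping two hypothetical buffers allocated elsewhere:

    // Hedged sketch: wrap two previously allocated buffers so they can be tagged and
    // freed as a unit. `buf_a` and `buf_b` are hypothetical buffers allocated elsewhere.
    static lm_ggml_backend_buffer_t make_weight_multi_buffer(lm_ggml_backend_buffer_t buf_a,
                                                             lm_ggml_backend_buffer_t buf_b) {
        lm_ggml_backend_buffer_t parts[2] = { buf_a, buf_b };
        lm_ggml_backend_buffer_t multi = lm_ggml_backend_multi_buffer_alloc_buffer(parts, 2);
        // set_usage fans out to every underlying buffer
        lm_ggml_backend_multi_buffer_set_usage(multi, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        // note: freeing `multi` later also frees buf_a and buf_b
        return multi;
    }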
+ // creates a copy of the tensor with the same memory layout
1008
+ static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
1009
+ struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
1010
+ for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
1011
+ dup->nb[i] = tensor->nb[i];
1012
+ }
1013
+ return dup;
1014
+ }
1015
+
1016
+ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
1017
+ return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
1018
+ }
1019
+
1020
+ // scheduler
1021
+
1022
+ #ifndef LM_GGML_SCHED_MAX_BACKENDS
1023
+ #define LM_GGML_SCHED_MAX_BACKENDS 16
1024
+ #endif
1025
+
1026
+ #ifndef LM_GGML_SCHED_MAX_SPLITS
1027
+ #define LM_GGML_SCHED_MAX_SPLITS 2048
1028
+ #endif
1029
+
1030
+ #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
1031
+ #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
1032
+ #endif
1033
+
1034
+ #ifndef LM_GGML_SCHED_MAX_COPIES
1035
+ #define LM_GGML_SCHED_MAX_COPIES 4
1036
+ #endif
1037
+
1038
+ struct lm_ggml_backend_sched_split {
1039
+ int backend_id;
1040
+ int i_start;
1041
+ int i_end;
1042
+ struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1043
+ int n_inputs;
1044
+ // graph view of this split
1045
+ struct lm_ggml_cgraph graph;
1046
+ };
1047
+
1048
+ struct lm_ggml_backend_sched {
1049
+ bool is_reset; // true if the scheduler has been reset since the last graph split
1050
+ bool is_alloc;
1051
+
1052
+ int n_backends;
1053
+
1054
+ lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
1055
+ lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
1056
+ lm_ggml_gallocr_t galloc;
1057
+
1058
+ // hash map of the nodes in the graph
1059
+ struct lm_ggml_hash_set hash_set;
1060
+ int * hv_tensor_backend_ids; // [hash_set.size]
1061
+ struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
1062
+
1063
+ int * node_backend_ids; // [graph_size]
1064
+ int * leaf_backend_ids; // [graph_size]
1065
+
1066
+ int * prev_node_backend_ids; // [graph_size]
1067
+ int * prev_leaf_backend_ids; // [graph_size]
1068
+
1069
+ // copy of the graph with modified inputs
1070
+ struct lm_ggml_cgraph graph;
1071
+
1072
+ // graph splits
1073
+ struct lm_ggml_backend_sched_split * splits;
1074
+ int n_splits;
1075
+ int splits_capacity;
1076
+
1077
+ // pipeline parallelism support
1078
+ int n_copies;
1079
+ int cur_copy;
1080
+ lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
1081
+ struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1082
+ int n_graph_inputs;
1083
+
1084
+ struct lm_ggml_context * ctx;
1085
+
1086
+ lm_ggml_backend_sched_eval_callback callback_eval;
1087
+ void * callback_eval_user_data;
1088
+
1089
+ char * context_buffer;
1090
+ size_t context_buffer_size;
1091
+
1092
+ bool debug;
1093
+ };
1094
+
1095
+ #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
1096
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1097
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1098
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1099
+
1100
+ // returns the priority of the backend, lower id is higher priority
1101
+ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1102
+ for (int i = 0; i < sched->n_backends; i++) {
1103
+ if (sched->backends[i] == backend) {
1104
+ return i;
1105
+ }
1106
+ }
1107
+ return -1;
1108
+ }
1109
+
1110
+ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
1111
+ lm_ggml_backend_buffer_t buffer = tensor->buffer;
1112
+ if (buffer == NULL) {
1113
+ return -1;
1114
+ }
1115
+
1116
+ // find highest prio backend that supports the buffer type and the op
1117
+ for (int i = 0; i < sched->n_backends; i++) {
1118
+ if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1119
+ lm_ggml_backend_supports_op(sched->backends[i], op)) {
1120
+ return i;
1121
+ }
1122
+ }
1123
+
1124
+ #ifndef NDEBUG
1125
+ fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1126
+ __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
1127
+ #endif
1128
+
1129
+ return -1;
1130
+ }
1131
+
1132
+ #if 0
1133
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1134
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1135
+ #define GET_CAUSE(node) causes[hash_id(node)]
1136
+ #else
1137
+ #define SET_CAUSE(node, ...)
1138
+ #define GET_CAUSE(node) ""
1139
+ #endif
1140
+
1141
+ // returns the backend that should be used for the node based on the current locations
1142
+ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
1143
+ // TODO: use supports_op to check if the backend supports the op
1144
+
1145
+ // assign pre-allocated nodes to their backend
1146
+ int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1147
+ if (cur_backend_id != -1) {
1148
+ SET_CAUSE(tensor, "1.dst");
1149
+ return cur_backend_id;
1150
+ }
1151
+
1152
+ // view_src
1153
+ if (tensor->view_src != NULL) {
1154
+ cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1155
+ if (cur_backend_id != -1) {
1156
+ SET_CAUSE(tensor, "1.vsrc");
1157
+ return cur_backend_id;
1158
+ }
1159
+ }
1160
+
1161
+ // graph input
1162
+ if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1163
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1164
+ SET_CAUSE(tensor, "1.inp");
1165
+ return cur_backend_id;
1166
+ }
1167
+
1168
+ // operations with weights are preferably run on the same backend as the weights
1169
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1170
+ const struct lm_ggml_tensor * src = tensor->src[i];
1171
+ if (src == NULL) {
1172
+ continue;
1173
+ }
1174
+ if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1175
+ int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1176
+ // check if a backend with higher prio wants to offload the op
1177
+ if (src_backend_id == sched->n_backends - 1) {
1178
+ for (int b = 0; b < src_backend_id; b++) {
1179
+ if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
1180
+ SET_CAUSE(tensor, "1.off");
1181
+ return b;
1182
+ }
1183
+ }
1184
+ }
1185
+ SET_CAUSE(tensor, "1.wgt%d", i);
1186
+ return src_backend_id;
1187
+ }
1188
+ }
1189
+
1190
+ return -1;
1191
+ }
1192
+
1193
+ static char * fmt_size(size_t size) {
1194
+ static char buffer[128];
1195
+ if (size >= 1024*1024) {
1196
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1197
+ } else {
1198
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1199
+ }
1200
+ return buffer;
1201
+ }
1202
+
1203
+ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1204
+ int cur_split = 0;
1205
+ for (int i = 0; i < graph->n_nodes; i++) {
1206
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1207
+ lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1208
+ fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
1209
+ sched->splits[cur_split].n_inputs);
1210
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1211
+ fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1212
+ fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
1213
+ }
1214
+ fprintf(stderr, "\n");
1215
+ cur_split++;
1216
+ }
1217
+ struct lm_ggml_tensor * node = graph->nodes[i];
1218
+ if (lm_ggml_is_view_op(node->op)) {
1219
+ continue;
1220
+ }
1221
+ lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1222
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1223
+ fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1224
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1225
+ struct lm_ggml_tensor * src = node->src[j];
1226
+ if (src == NULL) {
1227
+ continue;
1228
+ }
1229
+ lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1230
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1231
+ fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1232
+ }
1233
+ fprintf(stderr, "\n");
1234
+ }
1235
+ }
1236
+
1237
+ static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1238
+ lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1239
+ lm_ggml_backend_buffer_type_t buft = NULL;
1240
+
1241
+ if (buf) {
1242
+ // the tensor is already allocated
1243
+ buft = buf->buft;
1244
+ } else {
1245
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
1246
+ int tensor_backend_id = tensor_backend_id(t);
1247
+ if (tensor_backend_id == -1 && t->view_src) {
1248
+ tensor_backend_id = tensor_backend_id(t->view_src);
1249
+ }
1250
+ if (tensor_backend_id != -1) {
1251
+ buft = sched->bufts[tensor_backend_id];
1252
+ }
1253
+ }
1254
+
1255
+ return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
1256
+ }
1257
+
1258
+ static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1259
+ if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1260
+ *node_backend_id = cur_backend_id;
1261
+ SET_CAUSE(node, "2.sup");
1262
+ }
1263
+ }
1264
+
1265
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1266
+ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1267
+ // reset splits
1268
+ sched->n_splits = 0;
1269
+ sched->n_graph_inputs = 0;
1270
+ sched->is_reset = false;
1271
+
1272
+ struct lm_ggml_init_params params = {
1273
+ /* .mem_size = */ sched->context_buffer_size,
1274
+ /* .mem_buffer = */ sched->context_buffer,
1275
+ /* .no_alloc = */ true
1276
+ };
1277
+
1278
+ lm_ggml_free(sched->ctx);
1279
+
1280
+ sched->ctx = lm_ggml_init(params);
1281
+ if (sched->ctx == NULL) {
1282
+ LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
1283
+ }
1284
+
1285
+ // pass 1: assign backends to ops with pre-allocated inputs
1286
+ for (int i = 0; i < graph->n_leafs; i++) {
1287
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1288
+ int * leaf_backend_id = &tensor_backend_id(leaf);
1289
+ // do not overwrite user assignments
1290
+ if (*leaf_backend_id == -1) {
1291
+ *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1292
+ }
1293
+ }
1294
+
1295
+ for (int i = 0; i < graph->n_nodes; i++) {
1296
+ struct lm_ggml_tensor * node = graph->nodes[i];
1297
+ int * node_backend_id = &tensor_backend_id(node);
1298
+ // do not overwrite user assignments
1299
+ if (*node_backend_id == -1) {
1300
+ *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1301
+
1302
+ #if 0
1303
+ // src
1304
+ if (node->op == LM_GGML_OP_NONE) {
1305
+ continue;
1306
+ }
1307
+
1308
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1309
+ struct lm_ggml_tensor * src = node->src[j];
1310
+ if (src == NULL) {
1311
+ continue;
1312
+ }
1313
+ int * src_backend_id = &tensor_backend_id(src);
1314
+ if (*src_backend_id == -1) {
1315
+ *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1316
+ }
1317
+ }
1318
+ #endif
1319
+ }
1320
+ }
1321
+
1322
+ // pass 2: expand current backend assignments
1323
+ // assign the same backend to adjacent nodes
1324
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1325
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1326
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of their inputs are known
1327
+ // expand gpu down
1328
+ {
1329
+ int cur_backend_id = -1;
1330
+ for (int i = 0; i < graph->n_nodes; i++) {
1331
+ struct lm_ggml_tensor * node = graph->nodes[i];
1332
+ if (lm_ggml_is_view_op(node->op)) {
1333
+ continue;
1334
+ }
1335
+ int * node_backend_id = &tensor_backend_id(node);
1336
+ if (*node_backend_id != -1) {
1337
+ if (*node_backend_id == sched->n_backends - 1) {
1338
+ // skip cpu (lowest prio backend)
1339
+ cur_backend_id = -1;
1340
+ } else {
1341
+ cur_backend_id = *node_backend_id;
1342
+ }
1343
+ } else if (cur_backend_id != -1) {
1344
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1345
+ }
1346
+ }
1347
+ }
1348
+ // expand gpu up
1349
+ {
1350
+ int cur_backend_id = -1;
1351
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1352
+ struct lm_ggml_tensor * node = graph->nodes[i];
1353
+ if (lm_ggml_is_view_op(node->op)) {
1354
+ continue;
1355
+ }
1356
+ int * node_backend_id = &tensor_backend_id(node);
1357
+ if (*node_backend_id != -1) {
1358
+ if (*node_backend_id == sched->n_backends - 1) {
1359
+ // skip cpu (lowest prio backend)
1360
+ cur_backend_id = -1;
1361
+ } else {
1362
+ cur_backend_id = *node_backend_id;
1363
+ }
1364
+ } else if (cur_backend_id != -1) {
1365
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1366
+ }
1367
+ }
1368
+ }
1369
+ // expand rest down
1370
+ {
1371
+ int cur_backend_id = -1;
1372
+ for (int i = 0; i < graph->n_nodes; i++) {
1373
+ struct lm_ggml_tensor * node = graph->nodes[i];
1374
+ if (lm_ggml_is_view_op(node->op)) {
1375
+ continue;
1376
+ }
1377
+ int * node_backend_id = &tensor_backend_id(node);
1378
+ if (*node_backend_id != -1) {
1379
+ cur_backend_id = *node_backend_id;
1380
+ } else if (cur_backend_id != -1) {
1381
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1382
+ }
1383
+ }
1384
+ }
1385
+ // expand rest up
1386
+ {
1387
+ int cur_backend_id = -1;
1388
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1389
+ struct lm_ggml_tensor * node = graph->nodes[i];
1390
+ if (lm_ggml_is_view_op(node->op)) {
1391
+ continue;
1392
+ }
1393
+ int * node_backend_id = &tensor_backend_id(node);
1394
+ if (*node_backend_id != -1) {
1395
+ cur_backend_id = *node_backend_id;
1396
+ } else if (cur_backend_id != -1) {
1397
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1398
+ }
1399
+ }
1400
+ }
1401
+
1402
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1403
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1404
+ // however, we also need to verify that the sources are in compatible buffer types
1405
+ // (*) the actual requirement is more relaxed: the buffer type of the backend should be supported by all the users of this tensor further down the graph
1406
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1407
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1408
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1409
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1410
+ for (int i = 0; i < graph->n_nodes; i++) {
1411
+ struct lm_ggml_tensor * node = graph->nodes[i];
1412
+ if (lm_ggml_is_view_op(node->op)) {
1413
+ continue;
1414
+ }
1415
+ int * node_backend_id = &tensor_backend_id(node);
1416
+ if (*node_backend_id == -1) {
1417
+ // unassigned node: find the backend with the most supported inputs
1418
+ int n_supported_best = -1;
1419
+ for (int b = 0; b < sched->n_backends; b++) {
1420
+ if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1421
+ int n_supported = 0;
1422
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1423
+ struct lm_ggml_tensor * src = node->src[j];
1424
+ if (src == NULL) {
1425
+ continue;
1426
+ }
1427
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1428
+ n_supported++;
1429
+ }
1430
+ }
1431
+ if (n_supported > n_supported_best) {
1432
+ n_supported_best = n_supported;
1433
+ *node_backend_id = b;
1434
+ SET_CAUSE(node, "3.best");
1435
+ }
1436
+ }
1437
+ }
1438
+ } else {
1439
+ // assigned node: upgrade to higher prio backend if possible
1440
+ for (int b = 0; b < *node_backend_id; b++) {
1441
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1442
+ bool supported = true;
1443
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1444
+ struct lm_ggml_tensor * src = node->src[j];
1445
+ if (src == NULL) {
1446
+ continue;
1447
+ }
1448
+ if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1449
+ supported = false;
1450
+ break;
1451
+ }
1452
+ }
1453
+ if (supported) {
1454
+ *node_backend_id = b;
1455
+ SET_CAUSE(node, "3.upg");
1456
+ break;
1457
+ }
1458
+ }
1459
+ }
1460
+ }
1461
+ }
1462
+
1463
+ // pass 4: assign backends to remaining src from dst and view_src
1464
+ for (int i = 0; i < graph->n_nodes; i++) {
1465
+ struct lm_ggml_tensor * node = graph->nodes[i];
1466
+ int * cur_backend_id = &tensor_backend_id(node);
1467
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1468
+ *cur_backend_id = tensor_backend_id(node->view_src);
1469
+ SET_CAUSE(node, "4.vsrc");
1470
+ }
1471
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1472
+ struct lm_ggml_tensor * src = node->src[j];
1473
+ if (src == NULL) {
1474
+ continue;
1475
+ }
1476
+ int * src_backend_id = &tensor_backend_id(src);
1477
+ if (*src_backend_id == -1) {
1478
+ if (src->view_src != NULL) {
1479
+ // views are always on the same backend as the source
1480
+ *src_backend_id = tensor_backend_id(src->view_src);
1481
+ SET_CAUSE(src, "4.vsrc");
1482
+ } else {
1483
+ *src_backend_id = *cur_backend_id;
1484
+ SET_CAUSE(src, "4.cur");
1485
+ }
1486
+ }
1487
+ }
1488
+ }
1489
+
1490
+ // pass 5: split graph, find tensors that need to be copied
1491
+ {
1492
+ int i_split = 0;
1493
+ struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1494
+ // find the backend of the first split, skipping view ops
1495
+ int i = 0;
1496
+ for (; i < graph->n_nodes; i++) {
1497
+ struct lm_ggml_tensor * node = graph->nodes[i];
1498
+ if (!lm_ggml_is_view_op(node->op)) {
1499
+ split->backend_id = tensor_backend_id(node);
1500
+ break;
1501
+ }
1502
+ }
1503
+ split->i_start = 0;
1504
+ split->n_inputs = 0;
1505
+ int cur_backend_id = split->backend_id;
1506
+ for (; i < graph->n_nodes; i++) {
1507
+ struct lm_ggml_tensor * node = graph->nodes[i];
1508
+
1509
+ if (lm_ggml_is_view_op(node->op)) {
1510
+ continue;
1511
+ }
1512
+
1513
+ const int node_backend_id = tensor_backend_id(node);
1514
+
1515
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1516
+
1517
+ // check if we should start a new split based on the sources of the current node
1518
+ bool need_new_split = false;
1519
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1520
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1521
+ struct lm_ggml_tensor * src = node->src[j];
1522
+ if (src == NULL) {
1523
+ continue;
1524
+ }
1525
+ // check if a weight is on a different backend
1526
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1527
+ if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1528
+ int src_backend_id = tensor_backend_id(src);
1529
+ if (src_backend_id != cur_backend_id) {
1530
+ need_new_split = true;
1531
+ break;
1532
+ }
1533
+ }
1534
+ // check if the split has too many inputs
1535
+ // FIXME: count the number of inputs instead of only checking when full
1536
+ if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1537
+ const size_t id = hash_id(src);
1538
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1539
+ bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1540
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1541
+ //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1542
+ need_new_split = true;
1543
+ break;
1544
+ }
1545
+ }
1546
+ }
1547
+ }
1548
+
1549
+ if (node_backend_id != cur_backend_id || need_new_split) {
1550
+ split->i_end = i;
1551
+ i_split++;
1552
+ if (i_split >= sched->splits_capacity) {
1553
+ sched->splits_capacity *= 2;
1554
+ sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1555
+ LM_GGML_ASSERT(sched->splits != NULL);
1556
+ }
1557
+ LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS);
1558
+ split = &sched->splits[i_split];
1559
+ split->backend_id = node_backend_id;
1560
+ split->i_start = i;
1561
+ split->n_inputs = 0;
1562
+ cur_backend_id = node_backend_id;
1563
+ }
1564
+
1565
+ // find inputs that are not on the same backend
1566
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1567
+ struct lm_ggml_tensor * src = node->src[j];
1568
+ if (src == NULL) {
1569
+ continue;
1570
+ }
1571
+
1572
+ size_t src_id = hash_id(src);
1573
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1574
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1575
+
1576
+ if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1577
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1578
+ lm_ggml_backend_t backend = sched->backends[src_backend_id];
1579
+ for (int c = 0; c < sched->n_copies; c++) {
1580
+ struct lm_ggml_tensor * tensor_copy;
1581
+ if (c == sched->cur_copy) {
1582
+ tensor_copy = src; // use the original tensor as the current copy
1583
+ } else {
1584
+ tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1585
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1586
+ }
1587
+ if (sched->n_copies > 1) {
1588
+ lm_ggml_set_input(tensor_copy);
1589
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1590
+ }
1591
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1592
+ SET_CAUSE(tensor_copy, "4.cpy");
1593
+ }
1594
+ int n_graph_inputs = sched->n_graph_inputs++;
1595
+ LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1596
+ sched->graph_inputs[n_graph_inputs] = src;
1597
+ }
1598
+ }
1599
+
1600
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1601
+ // create a copy of the input in the split's backend
1602
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1603
+ lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1604
+ for (int c = 0; c < sched->n_copies; c++) {
1605
+ struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1606
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1607
+ if (sched->n_copies > 1) {
1608
+ lm_ggml_set_input(tensor_copy);
1609
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1610
+ }
1611
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1612
+ SET_CAUSE(tensor_copy, "4.cpy");
1613
+ }
1614
+ int n_inputs = split->n_inputs++;
1615
+ LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1616
+ split->inputs[n_inputs] = src;
1617
+ }
1618
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1619
+ }
1620
+ }
1621
+ }
1622
+ split->i_end = graph->n_nodes;
1623
+ sched->n_splits = i_split + 1;
1624
+ }
1625
+
1626
+ if (sched->debug) {
1627
+ lm_ggml_backend_sched_print_assignments(sched, graph);
1628
+ }
1629
+
1630
+ // swap node_backend_ids and leaf_backend_ids with prevs
1631
+ {
1632
+ int * tmp = sched->node_backend_ids;
1633
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1634
+ sched->prev_node_backend_ids = tmp;
1635
+
1636
+ tmp = sched->leaf_backend_ids;
1637
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1638
+ sched->prev_leaf_backend_ids = tmp;
1639
+ }
1640
+
1641
+ int graph_size = graph->n_nodes + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1642
+ if (sched->graph.size < graph_size) {
1643
+ sched->graph.size = graph_size;
1644
+ sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1645
+ sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1646
+ LM_GGML_ASSERT(sched->graph.nodes != NULL);
1647
+ LM_GGML_ASSERT(sched->graph.leafs != NULL);
1648
+ }
1649
+ sched->graph.n_nodes = 0;
1650
+ sched->graph.n_leafs = 0;
1651
+
1652
+ struct lm_ggml_cgraph * graph_copy = &sched->graph;
1653
+
1654
+ for (int i = 0; i < sched->n_splits; i++) {
1655
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1656
+ split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1657
+
1658
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1659
+ for (int j = 0; j < split->n_inputs; j++) {
1660
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1661
+
1662
+ struct lm_ggml_tensor * input = split->inputs[j];
1663
+ const size_t input_id = hash_id(input);
1664
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1665
+
1666
+ // add a dependency to the input source so that it is not freed before the copy is done
1667
+ struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
1668
+ input_dep->src[0] = input;
1669
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1670
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1671
+
1672
+ // add a dependency to the input copy so that it is allocated at the start of the split
1673
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1674
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1675
+ }
1676
+
1677
+ for (int j = split->i_start; j < split->i_end; j++) {
1678
+ assert(graph_copy->size > graph_copy->n_nodes);
1679
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1680
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1681
+ }
1682
+ }
1683
+
1684
+ if (sched->n_copies > 1) {
1685
+ // add input copies as leafs so that they are allocated first
1686
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1687
+ struct lm_ggml_tensor * input = sched->graph_inputs[i];
1688
+ size_t id = hash_id(input);
1689
+ int backend_id = tensor_backend_id(input);
1690
+ for (int c = 0; c < sched->n_copies; c++) {
1691
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1692
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1693
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1694
+ }
1695
+ }
1696
+
1697
+ for (int i = 0; i < sched->n_splits; i++) {
1698
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1699
+ int backend_id = split->backend_id;
1700
+ for (int j = 0; j < split->n_inputs; j++) {
1701
+ struct lm_ggml_tensor * input = split->inputs[j];
1702
+ size_t id = hash_id(input);
1703
+ for (int c = 0; c < sched->n_copies; c++) {
1704
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1705
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1706
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1707
+ }
1708
+ }
1709
+ }
1710
+ }
1711
+
1712
+ // add leafs from the original graph
1713
+ for (int i = 0; i < graph->n_leafs; i++) {
1714
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1715
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1716
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1717
+ }
1718
+ }
1719
+
1720
+ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
1721
+ bool backend_ids_changed = false;
1722
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
1723
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1724
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1725
+ backend_ids_changed = true;
1726
+ break;
1727
+ }
1728
+ }
1729
+ if (!backend_ids_changed) {
1730
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
1731
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1732
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1733
+ backend_ids_changed = true;
1734
+ break;
1735
+ }
1736
+ }
1737
+ }
1738
+
1739
+ // allocate graph
1740
+ if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1741
+ // the re-allocation may cause the split inputs to be moved to a different address
1742
+ lm_ggml_backend_sched_synchronize(sched);
1743
+ #ifndef NDEBUG
1744
+ fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1745
+ #endif
1746
+ lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1747
+ if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1748
+ fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1749
+ return false;
1750
+ }
1751
+ }
1752
+
1753
+ return true;
1754
+ }
1755
+
1756
+ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
1757
+ struct lm_ggml_backend_sched_split * splits = sched->splits;
1758
+
1759
+ for (int i = 0; i < sched->n_splits; i++) {
1760
+ struct lm_ggml_backend_sched_split * split = &splits[i];
1761
+ int split_backend_id = split->backend_id;
1762
+ lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
1763
+
1764
+ // copy the input tensors to the split backend
1765
+ for (int j = 0; j < split->n_inputs; j++) {
1766
+ lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1767
+ struct lm_ggml_tensor * input = split->inputs[j];
1768
+ struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1769
+
1770
+ if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1771
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1772
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1773
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1774
+ } else {
1775
+ lm_ggml_backend_synchronize(split_backend);
1776
+ }
1777
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1778
+ } else {
1779
+ // wait for the split backend to finish using the input before overwriting it
1780
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1781
+ lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1782
+ } else {
1783
+ lm_ggml_backend_synchronize(split_backend);
1784
+ }
1785
+ lm_ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
1786
+ }
1787
+ }
1788
+
1789
+ if (!sched->callback_eval) {
1790
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
1791
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1792
+ return ec;
1793
+ }
1794
+ } else {
1795
+ // similar to lm_ggml_backend_compare_graph_backend
1796
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1797
+ struct lm_ggml_tensor * t = split->graph.nodes[j0];
1798
+
1799
+ // check if the user needs data from this node
1800
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1801
+
1802
+ int j1 = j0;
1803
+
1804
+ // determine the range [j0, j1] of nodes that can be computed together
1805
+ while (!need && j1 < split->graph.n_nodes - 1) {
1806
+ t = split->graph.nodes[++j1];
1807
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1808
+ }
1809
+
1810
+ struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
1811
+
1812
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
1813
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1814
+ return ec;
1815
+ }
1816
+
1817
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1818
+ lm_ggml_backend_synchronize(split_backend);
1819
+
1820
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1821
+ break;
1822
+ }
1823
+
1824
+ j0 = j1;
1825
+ }
1826
+ }
1827
+
1828
+ // record the event of this copy
1829
+ if (split->n_inputs > 0) {
1830
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1831
+ lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
1832
+ }
1833
+ }
1834
+ }
1835
+
1836
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1837
+
1838
+ return LM_GGML_STATUS_SUCCESS;
1839
+ }
1840
+
1841
+ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1842
+ lm_ggml_backend_t * backends,
1843
+ lm_ggml_backend_buffer_type_t * bufts,
1844
+ int n_backends,
1845
+ size_t graph_size,
1846
+ bool parallel) {
1847
+ LM_GGML_ASSERT(n_backends > 0);
1848
+ LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
1849
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1850
+
1851
+ struct lm_ggml_backend_sched * sched = calloc(1, sizeof(struct lm_ggml_backend_sched));
1852
+
1853
+ sched->debug = getenv("LM_GGML_SCHED_DEBUG") != NULL;
1854
+ sched->n_backends = n_backends;
1855
+ sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
1856
+
1857
+ // initialize hash table
1858
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1859
+ sched->hash_set = lm_ggml_hash_set_new(graph_size);
1860
+ sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1861
+ sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1862
+
1863
+ const size_t nodes_size = graph_size + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1864
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1865
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1866
+ sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1867
+ sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1868
+
1869
+ sched->context_buffer_size = LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1870
+ sched->context_buffer = malloc(sched->context_buffer_size);
1871
+
1872
+ const int initial_splits_capacity = 16;
1873
+ sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1874
+ sched->splits_capacity = initial_splits_capacity;
1875
+
1876
+ for (int b = 0; b < n_backends; b++) {
1877
+ sched->backends[b] = backends[b];
1878
+ sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
1879
+ LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1880
+ if (sched->n_copies > 1) {
1881
+ for (int c = 0; c < sched->n_copies; c++) {
1882
+ sched->events[b][c] = lm_ggml_backend_event_new(backends[b]);
1883
+ }
1884
+ }
1885
+ }
1886
+
1887
+ sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
1888
+
1889
+ lm_ggml_backend_sched_reset(sched);
1890
+
1891
+ return sched;
1892
+ }
1893
+
1894
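Taken together, the scheduler API in this section can be driven with a single CPU backend as a minimal sketch (graph and graph_size are assumed to exist; lm_ggml_backend_free() is assumed from the public API, everything else appears in this file):

    // Hedged sketch: single-backend (CPU only) use of the scheduler.
    static enum lm_ggml_status sched_run_cpu_only(struct lm_ggml_cgraph * graph, size_t graph_size) {
        lm_ggml_backend_t backends[1] = { lm_ggml_backend_cpu_init() };

        // NULL bufts -> default buffer type per backend; parallel=false -> n_copies == 1
        lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(backends, NULL, 1, graph_size, false);

        enum lm_ggml_status st = lm_ggml_backend_sched_graph_compute(sched, graph);
        if (st == LM_GGML_STATUS_SUCCESS) {
            fprintf(stderr, "splits used: %d\n", lm_ggml_backend_sched_get_n_splits(sched));
        }

        lm_ggml_backend_sched_free(sched);
        lm_ggml_backend_free(backends[0]); // assumed public API
        return st;
    }

For repeated evaluation, lm_ggml_backend_sched_reserve() can be called once with a worst-case measure graph so that later allocations do not have to grow.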
+ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
1895
+ if (sched == NULL) {
1896
+ return;
1897
+ }
1898
+ for (int b = 0; b < sched->n_backends; b++) {
1899
+ for (int c = 0; c < sched->n_copies; c++) {
1900
+ lm_ggml_backend_event_free(sched->events[b][c]);
1901
+ }
1902
+ }
1903
+ lm_ggml_gallocr_free(sched->galloc);
1904
+ lm_ggml_free(sched->ctx);
1905
+ lm_ggml_hash_set_free(&sched->hash_set);
1906
+ free(sched->splits);
1907
+ free(sched->hv_tensor_backend_ids);
1908
+ free(sched->hv_tensor_copies);
1909
+ free(sched->node_backend_ids);
1910
+ free(sched->leaf_backend_ids);
1911
+ free(sched->prev_node_backend_ids);
1912
+ free(sched->prev_leaf_backend_ids);
1913
+ free(sched->context_buffer);
1914
+ free(sched->graph.nodes);
1915
+ free(sched->graph.leafs);
1916
+ free(sched);
1917
+ }
1918
+
1919
+ void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
1920
+ // reset state for the next run
1921
+ if (!sched->is_reset) {
1922
+ lm_ggml_hash_set_reset(&sched->hash_set);
1923
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1924
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1925
+ sched->is_reset = true;
1926
+ }
1927
+ sched->is_alloc = false;
1928
+ }
1929
+
1930
+ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
1931
+ LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1932
+
1933
+ lm_ggml_backend_sched_split_graph(sched, measure_graph);
1934
+
1935
+ if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1936
+ return false;
1937
+ }
1938
+
1939
+ lm_ggml_backend_sched_reset(sched);
1940
+ lm_ggml_backend_sched_synchronize(sched);
1941
+
1942
+ return true;
1943
+ }
1944
+
1945
+ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1946
+ LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1947
+
1948
+ lm_ggml_backend_sched_split_graph(sched, graph);
1949
+
1950
+
1951
+ if (!lm_ggml_backend_sched_alloc_splits(sched)) {
1952
+ return false;
1953
+ }
1954
+
1955
+ sched->is_alloc = true;
1956
+
1957
+ return true;
1958
+ }
1959
+
1960
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1961
+ enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
1962
+ lm_ggml_backend_sched_synchronize(sched);
1963
+ return err;
1964
+ }
1965
+
1966
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1967
+ if (!sched->is_reset && !sched->is_alloc) {
1968
+ lm_ggml_backend_sched_reset(sched);
1969
+ }
1970
+
1971
+ if (!sched->is_alloc) {
1972
+ if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
1973
+ return LM_GGML_STATUS_ALLOC_FAILED;
1974
+ }
1975
+ }
1976
+
1977
+ return lm_ggml_backend_sched_compute_splits(sched);
1978
+ }
1979
+
1980
+ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
1981
+ for (int i = 0; i < sched->n_backends; i++) {
1982
+ lm_ggml_backend_synchronize(sched->backends[i]);
1983
+ }
1984
+ }
1985
+
1986
+ void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
1987
+ sched->callback_eval = callback;
1988
+ sched->callback_eval_user_data = user_data;
1989
+ }
1990
+
1991
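The eval callback is called twice per observed node range in lm_ggml_backend_sched_compute_splits above: once with the second argument true to ask whether the caller wants to inspect the node, and once with false after it has been computed, where returning false stops the split early. A hedged sketch of such a callback; the parameter names are inferred from those call sites, not from a header in this diff:

    // Hedged sketch: observe only MUL_MAT results, logging each one after it is computed.
    static bool log_mul_mat_cb(struct lm_ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return t->op == LM_GGML_OP_MUL_MAT;   // "yes, pause after this node"
        }
        fprintf(stderr, "computed %s\n", t->name);
        return true;                              // returning false would stop the split early
    }

    // lm_ggml_backend_sched_set_eval_callback(sched, log_mul_mat_cb, NULL);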
+ int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
1992
+ return sched->n_splits;
1993
+ }
1994
+
1995
+ int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
1996
+ return sched->n_copies;
1997
+ }
1998
+
1999
+ int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
2000
+ return sched->n_backends;
2001
+ }
2002
+
2003
+ lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
2004
+ LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
2005
+ return sched->backends[i];
2006
+ }
2007
+
2008
+ size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
2009
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2010
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2011
+
2012
+ return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2013
+ }
2014
+
2015
+ void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
2016
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2017
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2018
+ tensor_backend_id(node) = backend_index;
2019
+ SET_CAUSE(node, "usr");
2020
+ sched->is_reset = false;
2021
+ }
2022
+
2023
+ lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
2024
+ int backend_index = tensor_backend_id(node);
2025
+ if (backend_index == -1) {
2026
+ return NULL;
2027
+ }
2028
+ return sched->backends[backend_index];
2029
+ }
2030
+
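// [Editor's note] Illustrative usage sketch - not part of the package diff.
// Pins a node to a chosen backend before allocation and reads the assignment
// back with the getter defined above; "sched", "node" and "backend" are
// supplied by the caller.
static void sched_pin_node_sketch(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
    lm_ggml_backend_sched_set_tensor_backend(sched, node, backend);

    // the getter returns the pinned backend once the node has an assignment
    lm_ggml_backend_t assigned = lm_ggml_backend_sched_get_tensor_backend(sched, node);
    LM_GGML_ASSERT(assigned == backend);
}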
+ // utils
+
+ void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
+     LM_GGML_ASSERT(tensor->buffer == NULL);
+     LM_GGML_ASSERT(tensor->view_src != NULL);
+     LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
+     LM_GGML_ASSERT(tensor->view_src->data != NULL);
+
+     tensor->buffer = tensor->view_src->buffer;
+     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
+     lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
+ }
+
+ void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
+     LM_GGML_ASSERT(tensor->buffer == NULL);
+     LM_GGML_ASSERT(tensor->data == NULL);
+     LM_GGML_ASSERT(tensor->view_src == NULL);
+     LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
+     LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
+                    (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
+
+     tensor->buffer = buffer;
+     tensor->data = addr;
+     lm_ggml_backend_buffer_init_tensor(buffer, tensor);
+ }
+
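// [Editor's note] Illustrative usage sketch - not part of the package diff.
// Places a tensor from a no_alloc context at the start of an existing backend
// buffer with lm_ggml_backend_tensor_alloc(); the tensor must satisfy the
// asserts above (no buffer, no data, not a view, address within the buffer).
static void tensor_alloc_sketch(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
    void * base = lm_ggml_backend_buffer_get_base(buffer);
    lm_ggml_backend_tensor_alloc(buffer, tensor, base); // place the tensor at offset 0
}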
+ static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
+         struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
+
+     LM_GGML_ASSERT(src != NULL);
+     LM_GGML_ASSERT(src->data && "graph must be allocated");
+
+     size_t id = lm_ggml_hash_insert(&hash_set, src);
+     if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
+         return node_copies[lm_ggml_hash_find(&hash_set, src)];
+     }
+
+     struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
+     if (src->view_src != NULL) {
+         dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+         dst->view_offs = src->view_offs;
+     }
+     dst->op = src->op;
+     memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
+     lm_ggml_set_name(dst, src->name);
+
+     // copy src
+     for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
+         struct lm_ggml_tensor * s = src->src[i];
+         if (s == NULL) {
+             continue;
+         }
+         dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+     }
+
+     node_copies[id] = dst;
+     return dst;
+ }
+
+ static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
+     size_t id = lm_ggml_hash_find(hash_set, src);
+     if (node_init[id]) {
+         return;
+     }
+     node_init[id] = true;
+
+     struct lm_ggml_tensor * dst = node_copies[id];
+     if (dst->view_src != NULL) {
+         graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
+         lm_ggml_backend_view_init(dst);
+     }
+     else {
+         lm_ggml_backend_tensor_copy(src, dst);
+     }
+
+     // init src
+     for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
+         struct lm_ggml_tensor * s = src->src[i];
+         if (s == NULL) {
+             continue;
+         }
+         graph_copy_init_tensor(hash_set, node_copies, node_init, s);
+     }
+ }
+
+ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
+     struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
+     struct lm_ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+     bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
+
+     struct lm_ggml_init_params params = {
+         /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
+         /* .mem_buffer = */ NULL,
+         /* .no_alloc = */ true
+     };
+
+     struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
+     struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
+
+     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+         fprintf(stderr, "failed to allocate context for graph copy\n");
+         lm_ggml_hash_set_free(&hash_set);
+         free(node_copies);
+         free(node_init);
+         lm_ggml_free(ctx_allocated);
+         lm_ggml_free(ctx_unallocated);
+         return (struct lm_ggml_backend_graph_copy) {
+             /* .buffer = */ NULL,
+             /* .ctx_allocated = */ NULL,
+             /* .ctx_unallocated = */ NULL,
+             /* .graph = */ NULL,
+         };
+     }
+
+     // dup nodes
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct lm_ggml_tensor * node = graph->nodes[i];
+         graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+     }
+
+     // allocate nodes
+     lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+     if (buffer == NULL) {
+         fprintf(stderr, "failed to allocate buffer for graph copy\n");
+         lm_ggml_hash_set_free(&hash_set);
+         free(node_copies);
+         free(node_init);
+         lm_ggml_free(ctx_allocated);
+         lm_ggml_free(ctx_unallocated);
+         return (struct lm_ggml_backend_graph_copy) {
+             /* .buffer = */ NULL,
+             /* .ctx_allocated = */ NULL,
+             /* .ctx_unallocated = */ NULL,
+             /* .graph = */ NULL,
+         };
+     }
+
+     //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
+
+     // copy data and init views
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct lm_ggml_tensor * node = graph->nodes[i];
+         graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
+     }
+
+     // build graph copy
+     struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct lm_ggml_tensor * node = graph->nodes[i];
+         struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
+         graph_copy->nodes[i] = node_copy;
+     }
+     graph_copy->n_nodes = graph->n_nodes;
+
+     lm_ggml_hash_set_free(&hash_set);
+     free(node_copies);
+     free(node_init);
+
+     return (struct lm_ggml_backend_graph_copy) {
+         /* .buffer = */ buffer,
+         /* .ctx_allocated = */ ctx_allocated,
+         /* .ctx_unallocated = */ ctx_unallocated,
+         /* .graph = */ graph_copy,
+     };
+ }
+
+ void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
+     lm_ggml_backend_buffer_free(copy.buffer);
+     lm_ggml_free(copy.ctx_allocated);
+     lm_ggml_free(copy.ctx_unallocated);
+ }
+
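// [Editor's note] Illustrative usage sketch - not part of the package diff.
// lm_ggml_backend_graph_copy() duplicates an allocated graph onto another
// backend; the returned struct owns a buffer and two contexts, so it must be
// released with lm_ggml_backend_graph_copy_free().
static void graph_copy_sketch(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
    struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend, graph);
    if (copy.buffer == NULL) {
        return; // context or buffer allocation failed; nothing to free
    }
    // copy.graph mirrors the original graph node-by-node on the target backend
    lm_ggml_backend_graph_copy_free(copy);
}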
+ bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
+     struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
+     if (copy.buffer == NULL) {
+         return false;
+     }
+
+     struct lm_ggml_cgraph * g1 = graph;
+     struct lm_ggml_cgraph * g2 = copy.graph;
+
+     assert(g1->n_nodes == g2->n_nodes);
+
+     for (int i = 0; i < g1->n_nodes; i++) {
+         //printf("eval %d/%d\n", i, g1->n_nodes);
+         struct lm_ggml_tensor * t1 = g1->nodes[i];
+         struct lm_ggml_tensor * t2 = g2->nodes[i];
+
+         assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
+
+         struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
+         struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
+
+         lm_ggml_backend_graph_compute(backend1, &g1v);
+         lm_ggml_backend_graph_compute(backend2, &g2v);
+
+         if (lm_ggml_is_view_op(t1->op)) {
+             continue;
+         }
+
+         // compare results, calculate rms etc
+         if (!callback(i, t1, t2, user_data)) {
+             break;
+         }
+     }
+
+     lm_ggml_backend_graph_copy_free(copy);
+
+     return true;
+ }
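// [Editor's note] Illustrative usage sketch - not part of the package diff.
// A minimal lm_ggml_backend_eval_callback for the comparison helper above: it
// is called once per non-view node with both computed tensors and returns
// false to stop the walk early. A real callback would compare the tensor data.
static bool compare_eval_sketch(int i, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data) {
    (void) user_data;
    fprintf(stderr, "compared node %d: %s vs %s\n", i, t1->name, t2->name);
    return true; // keep comparing the remaining nodes
}
// usage: lm_ggml_backend_compare_graph_backend(backend1, backend2, graph, compare_eval_sketch, NULL);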