cui-llama.rn 1.2.6 → 1.3.0

This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (70)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +20 -5
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +222 -34
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/common.cpp +1682 -2114
  9. package/cpp/common.h +600 -613
  10. package/cpp/ggml-aarch64.c +129 -3478
  11. package/cpp/ggml-aarch64.h +19 -39
  12. package/cpp/ggml-alloc.c +1040 -1040
  13. package/cpp/ggml-alloc.h +76 -76
  14. package/cpp/ggml-backend-impl.h +216 -216
  15. package/cpp/ggml-backend-reg.cpp +195 -0
  16. package/cpp/ggml-backend.cpp +1997 -2661
  17. package/cpp/ggml-backend.h +328 -314
  18. package/cpp/ggml-common.h +1853 -1853
  19. package/cpp/ggml-cpp.h +38 -38
  20. package/cpp/ggml-cpu-aarch64.c +3560 -0
  21. package/cpp/ggml-cpu-aarch64.h +30 -0
  22. package/cpp/ggml-cpu-impl.h +371 -614
  23. package/cpp/ggml-cpu-quants.c +10822 -0
  24. package/cpp/ggml-cpu-quants.h +63 -0
  25. package/cpp/ggml-cpu.c +13975 -13720
  26. package/cpp/ggml-cpu.cpp +663 -0
  27. package/cpp/ggml-cpu.h +177 -150
  28. package/cpp/ggml-impl.h +550 -296
  29. package/cpp/ggml-metal.h +66 -66
  30. package/cpp/ggml-metal.m +4294 -3933
  31. package/cpp/ggml-quants.c +5247 -15739
  32. package/cpp/ggml-quants.h +100 -147
  33. package/cpp/ggml-threading.cpp +12 -0
  34. package/cpp/ggml-threading.h +12 -0
  35. package/cpp/ggml.c +8180 -8390
  36. package/cpp/ggml.h +2411 -2441
  37. package/cpp/llama-grammar.cpp +1138 -1138
  38. package/cpp/llama-grammar.h +144 -144
  39. package/cpp/llama-impl.h +181 -181
  40. package/cpp/llama-sampling.cpp +2348 -2345
  41. package/cpp/llama-sampling.h +48 -48
  42. package/cpp/llama-vocab.cpp +1984 -1984
  43. package/cpp/llama-vocab.h +170 -170
  44. package/cpp/llama.cpp +22132 -22046
  45. package/cpp/llama.h +1253 -1255
  46. package/cpp/log.cpp +401 -401
  47. package/cpp/log.h +121 -121
  48. package/cpp/rn-llama.hpp +83 -19
  49. package/cpp/sampling.cpp +466 -466
  50. package/cpp/sgemm.cpp +1884 -1276
  51. package/ios/RNLlama.mm +43 -20
  52. package/ios/RNLlamaContext.h +9 -3
  53. package/ios/RNLlamaContext.mm +133 -33
  54. package/jest/mock.js +0 -1
  55. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  56. package/lib/commonjs/index.js +52 -15
  57. package/lib/commonjs/index.js.map +1 -1
  58. package/lib/module/NativeRNLlama.js.map +1 -1
  59. package/lib/module/index.js +51 -15
  60. package/lib/module/index.js.map +1 -1
  61. package/lib/typescript/NativeRNLlama.d.ts +29 -5
  62. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  63. package/lib/typescript/index.d.ts +12 -5
  64. package/lib/typescript/index.d.ts.map +1 -1
  65. package/package.json +1 -1
  66. package/src/NativeRNLlama.ts +41 -6
  67. package/src/index.ts +82 -27
  68. package/cpp/json-schema-to-grammar.cpp +0 -1045
  69. package/cpp/json-schema-to-grammar.h +0 -8
  70. package/cpp/json.hpp +0 -24766
@@ -1,2661 +1,1997 @@
1
- // Note: porting this file to C++ is a work in progress
2
-
3
- #ifdef _WIN32
4
- #define WIN32_LEAN_AND_MEAN
5
- #ifndef NOMINMAX
6
- # define NOMINMAX
7
- #endif
8
- #include <windows.h>
9
- #endif
10
-
11
- #include "ggml-backend.h"
12
- #include "ggml-backend-impl.h"
13
- #include "ggml-alloc.h"
14
- #include "ggml-impl.h"
15
-
16
- #include <assert.h>
17
- #include <limits.h>
18
- #include <stdarg.h>
19
- #include <stdio.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <string>
23
- #include <vector>
24
-
25
- #ifdef __APPLE__
26
- #include <sys/types.h>
27
- #include <sys/sysctl.h>
28
- #endif
29
-
30
-
31
- // backend buffer type
32
-
33
- const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
34
- return buft->iface.get_name(buft);
35
- }
36
-
37
- lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
38
- if (size == 0) {
39
- // return a dummy buffer for zero-sized allocations
40
- return lm_ggml_backend_buffer_init(buft, {}, NULL, 0);
41
- }
42
-
43
- return buft->iface.alloc_buffer(buft, size);
44
- }
45
-
46
- size_t lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type_t buft) {
47
- return buft->iface.get_alignment(buft);
48
- }
49
-
50
- size_t lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_type_t buft) {
51
- // get_max_size is optional, defaults to SIZE_MAX
52
- if (buft->iface.get_max_size) {
53
- return buft->iface.get_max_size(buft);
54
- }
55
- return SIZE_MAX;
56
- }
57
-
58
- size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
59
- // get_alloc_size is optional, defaults to lm_ggml_nbytes
60
- if (buft->iface.get_alloc_size) {
61
- size_t size = buft->iface.get_alloc_size(buft, tensor);
62
- assert(size >= lm_ggml_nbytes(tensor));
63
- return size;
64
- }
65
- return lm_ggml_nbytes(tensor);
66
- }
67
-
68
- bool lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_type_t buft) {
69
- if (buft->iface.is_host) {
70
- return buft->iface.is_host(buft);
71
- }
72
- return false;
73
- }
74
-
75
- lm_ggml_backend_dev_t lm_ggml_backend_buft_get_device(lm_ggml_backend_buffer_type_t buft) {
76
- return buft->device;
77
- }
78
-
79
- // backend buffer
80
-
81
- lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
82
- lm_ggml_backend_buffer_type_t buft,
83
- struct lm_ggml_backend_buffer_i iface,
84
- void * context,
85
- size_t size) {
86
- lm_ggml_backend_buffer_t buffer = new lm_ggml_backend_buffer {
87
- /* .interface = */ iface,
88
- /* .buft = */ buft,
89
- /* .context = */ context,
90
- /* .size = */ size,
91
- /* .usage = */ LM_GGML_BACKEND_BUFFER_USAGE_ANY
92
- };
93
-
94
- return buffer;
95
- }
96
-
97
- const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
98
- return lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(buffer));
99
- }
100
-
101
- void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
102
- if (buffer == NULL) {
103
- return;
104
- }
105
-
106
- if (buffer->iface.free_buffer != NULL) {
107
- buffer->iface.free_buffer(buffer);
108
- }
109
- delete buffer;
110
- }
111
-
112
- size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
113
- return buffer->size;
114
- }
115
-
116
- void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
117
- // get_base is optional if the buffer is zero-sized
118
- if (buffer->size == 0) {
119
- return NULL;
120
- }
121
-
122
- void * base = buffer->iface.get_base(buffer);
123
-
124
- LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
125
-
126
- return base;
127
- }
128
-
129
- void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
130
- // init_tensor is optional
131
- if (buffer->iface.init_tensor) {
132
- buffer->iface.init_tensor(buffer, tensor);
133
- }
134
- }
135
-
136
- void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
137
- // clear is optional if the buffer is zero-sized
138
- if (buffer->size == 0) {
139
- return;
140
- }
141
-
142
- buffer->iface.clear(buffer, value);
143
- }
144
-
145
- size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
146
- return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
147
- }
148
-
149
- size_t lm_ggml_backend_buffer_get_max_size(lm_ggml_backend_buffer_t buffer) {
150
- return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_get_type(buffer));
151
- }
152
-
153
- size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
154
- return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
155
- }
156
-
157
- bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
158
- return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
159
- }
160
-
161
- void lm_ggml_backend_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
162
- buffer->usage = usage;
163
-
164
- // FIXME: add a generic callback to the buffer interface
165
- if (lm_ggml_backend_buffer_is_multi_buffer(buffer)) {
166
- lm_ggml_backend_multi_buffer_set_usage(buffer, usage);
167
- }
168
- }
169
-
170
- enum lm_ggml_backend_buffer_usage lm_ggml_backend_buffer_get_usage(lm_ggml_backend_buffer_t buffer) {
171
- return buffer->usage;
172
- }
173
-
174
- lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_get_type(lm_ggml_backend_buffer_t buffer) {
175
- return buffer->buft;
176
- }
177
-
178
- void lm_ggml_backend_buffer_reset(lm_ggml_backend_buffer_t buffer) {
179
- if (buffer->iface.reset) {
180
- buffer->iface.reset(buffer);
181
- }
182
- }
183
-
184
- bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
185
- lm_ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
186
- if (dst_buf->iface.cpy_tensor) {
187
- return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
188
- }
189
- return false;
190
- }
191
-
192
- // backend
193
-
194
- lm_ggml_guid_t lm_ggml_backend_guid(lm_ggml_backend_t backend) {
195
- if (backend == NULL) {
196
- return NULL;
197
- }
198
- return backend->guid;
199
- }
200
-
201
- const char * lm_ggml_backend_name(lm_ggml_backend_t backend) {
202
- if (backend == NULL) {
203
- return "NULL";
204
- }
205
- return backend->iface.get_name(backend);
206
- }
207
-
208
- void lm_ggml_backend_free(lm_ggml_backend_t backend) {
209
- if (backend == NULL) {
210
- return;
211
- }
212
-
213
- backend->iface.free(backend);
214
- }
215
-
216
- lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
217
- return lm_ggml_backend_dev_buffer_type(backend->device);
218
- }
219
-
220
- lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
221
- return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_get_default_buffer_type(backend), size);
222
- }
223
-
224
- size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend) {
225
- return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_get_default_buffer_type(backend));
226
- }
227
-
228
- size_t lm_ggml_backend_get_max_size(lm_ggml_backend_t backend) {
229
- return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_get_default_buffer_type(backend));
230
- }
231
-
232
- void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
233
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
234
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
235
-
236
- if (backend->iface.set_tensor_async == NULL) {
237
- lm_ggml_backend_tensor_set(tensor, data, offset, size);
238
- } else {
239
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
240
- }
241
- }
242
-
243
- void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
244
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
245
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
246
-
247
- if (backend->iface.get_tensor_async == NULL) {
248
- lm_ggml_backend_tensor_get(tensor, data, offset, size);
249
- } else {
250
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
251
- }
252
- }
253
-
254
- void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
255
- lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
256
-
257
- if (size == 0) {
258
- return;
259
- }
260
-
261
- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
262
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
263
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
264
-
265
- buf->iface.set_tensor(buf, tensor, data, offset, size);
266
- }
267
-
268
- void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
269
- lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
270
-
271
- if (size == 0) {
272
- return;
273
- }
274
-
275
- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
276
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
277
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
278
-
279
- buf->iface.get_tensor(buf, tensor, data, offset, size);
280
- }
281
-
282
- LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
283
- lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
284
-
285
- if (size == 0) {
286
- return;
287
- }
288
-
289
- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
290
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
291
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
292
- LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
293
-
294
- buf->iface.memset_tensor(buf, tensor, value, offset, size);
295
- }
296
-
297
- void lm_ggml_backend_synchronize(lm_ggml_backend_t backend) {
298
- if (backend->iface.synchronize == NULL) {
299
- return;
300
- }
301
-
302
- backend->iface.synchronize(backend);
303
- }
304
-
305
- lm_ggml_backend_graph_plan_t lm_ggml_backend_graph_plan_create(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
306
- LM_GGML_ASSERT(backend->iface.graph_plan_create != NULL);
307
-
308
- return backend->iface.graph_plan_create(backend, cgraph);
309
- }
310
-
311
- void lm_ggml_backend_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
312
- LM_GGML_ASSERT(backend->iface.graph_plan_free != NULL);
313
-
314
- backend->iface.graph_plan_free(backend, plan);
315
- }
316
-
317
- enum lm_ggml_status lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
318
- LM_GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
319
-
320
- return backend->iface.graph_plan_compute(backend, plan);
321
- }
322
-
323
- enum lm_ggml_status lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
324
- enum lm_ggml_status err = lm_ggml_backend_graph_compute_async(backend, cgraph);
325
- lm_ggml_backend_synchronize(backend);
326
- return err;
327
- }
328
-
329
- enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
330
- return backend->iface.graph_compute(backend, cgraph);
331
- }
332
-
333
- bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
334
- return lm_ggml_backend_dev_supports_op(backend->device, op);
335
- }
336
-
337
- bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
338
- return lm_ggml_backend_dev_supports_buft(backend->device, buft);
339
- }
340
-
341
- bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
342
- return lm_ggml_backend_dev_offload_op(backend->device, op);
343
- }
344
-
345
- lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
346
- return backend->device;
347
- }
348
-
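Editor's note: the buffer-type and backend helpers removed above are thin wrappers over each backend's vtable, with optional entries falling back to defaults (SIZE_MAX for the max size, lm_ggml_nbytes for the allocation size, a blocking tensor set/get when no async variant exists). The following sketch shows how a caller might exercise the buffer API; it is illustrative only and not part of this diff. It assumes `backend` is a valid lm_ggml_backend_t, and the helper name demo_backend_buffers is hypothetical.

    #include <stdio.h>
    #include "ggml-backend.h"

    static void demo_backend_buffers(lm_ggml_backend_t backend) {
        // every backend exposes a default buffer type describing where its tensors live
        lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_get_default_buffer_type(backend);
        printf("buffer type %s: alignment %zu, host memory: %d\n",
               lm_ggml_backend_buft_name(buft),
               lm_ggml_backend_buft_get_alignment(buft),
               lm_ggml_backend_buft_is_host(buft) ? 1 : 0);

        // allocate 1 MiB from that buffer type (a zero-sized request yields a dummy buffer)
        lm_ggml_backend_buffer_t buf = lm_ggml_backend_buft_alloc_buffer(buft, 1024*1024);
        printf("allocated %zu bytes at base %p\n",
               lm_ggml_backend_buffer_get_size(buf),
               lm_ggml_backend_buffer_get_base(buf));

        lm_ggml_backend_buffer_clear(buf, 0); // zero-fill the backend memory
        lm_ggml_backend_buffer_free(buf);     // release it
    }
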
349
- // backend copy
350
-
351
- static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) {
352
- if (a->type != b->type) {
353
- return false;
354
- }
355
- for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
356
- if (a->ne[i] != b->ne[i]) {
357
- return false;
358
- }
359
- if (a->nb[i] != b->nb[i]) {
360
- return false;
361
- }
362
- }
363
- return true;
364
- }
365
-
366
- void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
367
- LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
368
-
369
- if (src == dst) {
370
- return;
371
- }
372
-
373
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
374
- lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src));
375
- } else if (lm_ggml_backend_buffer_is_host(dst->buffer)) {
376
- lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
377
- } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
378
- #ifndef NDEBUG
379
- LM_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
380
- #endif
381
- size_t nbytes = lm_ggml_nbytes(src);
382
- void * data = malloc(nbytes);
383
- lm_ggml_backend_tensor_get(src, data, 0, nbytes);
384
- lm_ggml_backend_tensor_set(dst, data, 0, nbytes);
385
- free(data);
386
- }
387
- }
388
-
389
- void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
390
- LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
391
-
392
- if (src == dst) {
393
- return;
394
- }
395
-
396
- if (backend_dst->iface.cpy_tensor_async != NULL) {
397
- if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
398
- return;
399
- }
400
- }
401
-
402
- // an async copy would normally happen after all the queued operations on both backends are completed
403
- // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
404
- lm_ggml_backend_synchronize(backend_src);
405
- lm_ggml_backend_synchronize(backend_dst);
406
- lm_ggml_backend_tensor_copy(src, dst);
407
- }
408
-
409
- // events
410
-
411
- lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_dev_t device) {
412
- // null device is allowed for the transition period to the device interface
413
- if (device == NULL || device->iface.event_new == NULL) {
414
- return NULL;
415
- }
416
- return device->iface.event_new(device);
417
- }
418
-
419
- void lm_ggml_backend_event_free(lm_ggml_backend_event_t event) {
420
- if (event == NULL) {
421
- return;
422
- }
423
- event->device->iface.event_free(event->device, event);
424
- }
425
-
426
- void lm_ggml_backend_event_record(lm_ggml_backend_event_t event, lm_ggml_backend_t backend) {
427
- LM_GGML_ASSERT(backend->iface.event_record != NULL);
428
-
429
- backend->iface.event_record(backend, event);
430
- }
431
-
432
- void lm_ggml_backend_event_synchronize(lm_ggml_backend_event_t event) {
433
- LM_GGML_ASSERT(event->device->iface.event_synchronize);
434
-
435
- event->device->iface.event_synchronize(event->device, event);
436
- }
437
-
438
- void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event_t event) {
439
- LM_GGML_ASSERT(backend->iface.event_wait != NULL);
440
-
441
- backend->iface.event_wait(backend, event);
442
- }
443
-
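Editor's note: the event helpers above are meant to let one backend (or the host) wait on work queued on another backend without a full synchronize. A sketch of that pattern follows; it is illustrative only, not part of this diff, and assumes both backends implement events, that graph_b depends on results of graph_a, that all tensors are already allocated, and that the declarations from "ggml-backend.h" are in scope. The name demo_events is hypothetical.

    static void demo_events(lm_ggml_backend_t backend_a, lm_ggml_backend_t backend_b,
                            struct lm_ggml_cgraph * graph_a, struct lm_ggml_cgraph * graph_b) {
        // events are created per device; backends without event support return NULL
        lm_ggml_backend_event_t ev = lm_ggml_backend_event_new(lm_ggml_backend_get_device(backend_a));

        lm_ggml_backend_graph_compute_async(backend_a, graph_a); // queue work on A
        if (ev != NULL) {
            lm_ggml_backend_event_record(ev, backend_a);         // mark the end of A's queued work
            lm_ggml_backend_event_wait(backend_b, ev);           // B waits for A without blocking the host
        } else {
            lm_ggml_backend_synchronize(backend_a);              // fallback: blocking sync on A
        }
        lm_ggml_backend_graph_compute_async(backend_b, graph_b); // queue dependent work on B

        lm_ggml_backend_synchronize(backend_b);                  // finally block the host on B
        lm_ggml_backend_event_free(ev);
    }
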
444
- // Backend device
445
-
446
- const char * lm_ggml_backend_dev_name(lm_ggml_backend_dev_t device) {
447
- return device->iface.get_name(device);
448
- }
449
-
450
- const char * lm_ggml_backend_dev_description(lm_ggml_backend_dev_t device) {
451
- return device->iface.get_description(device);
452
- }
453
-
454
- void lm_ggml_backend_dev_memory(lm_ggml_backend_dev_t device, size_t * free, size_t * total) {
455
- device->iface.get_memory(device, free, total);
456
- }
457
-
458
- enum lm_ggml_backend_dev_type lm_ggml_backend_dev_type(lm_ggml_backend_dev_t device) {
459
- return device->iface.get_type(device);
460
- }
461
-
462
- void lm_ggml_backend_dev_get_props(lm_ggml_backend_dev_t device, struct lm_ggml_backend_dev_props * props) {
463
- memset(props, 0, sizeof(*props));
464
- device->iface.get_props(device, props);
465
- }
466
-
467
- lm_ggml_backend_reg_t lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_t device) {
468
- return device->reg;
469
- }
470
-
471
- lm_ggml_backend_t lm_ggml_backend_dev_init(lm_ggml_backend_dev_t device, const char * params) {
472
- return device->iface.init_backend(device, params);
473
- }
474
-
475
- lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_buffer_type(lm_ggml_backend_dev_t device) {
476
- return device->iface.get_buffer_type(device);
477
- }
478
-
479
- lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_host_buffer_type(lm_ggml_backend_dev_t device) {
480
- if (device->iface.get_host_buffer_type == NULL) {
481
- return NULL;
482
- }
483
-
484
- return device->iface.get_host_buffer_type(device);
485
- }
486
-
487
- lm_ggml_backend_buffer_t lm_ggml_backend_dev_buffer_from_host_ptr(lm_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
488
- return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
489
- }
490
-
491
- bool lm_ggml_backend_dev_supports_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
492
- return device->iface.supports_op(device, op);
493
- }
494
-
495
- bool lm_ggml_backend_dev_supports_buft(lm_ggml_backend_dev_t device, lm_ggml_backend_buffer_type_t buft) {
496
- return device->iface.supports_buft(device, buft);
497
- }
498
-
499
- bool lm_ggml_backend_dev_offload_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
500
- if (device->iface.offload_op != NULL) {
501
- return device->iface.offload_op(device, op);
502
- }
503
-
504
- return false;
505
- }
506
-
507
- // Backend (reg)
508
-
509
- const char * lm_ggml_backend_reg_name(lm_ggml_backend_reg_t reg) {
510
- return reg->iface.get_name(reg);
511
- }
512
-
513
- size_t lm_ggml_backend_reg_dev_count(lm_ggml_backend_reg_t reg) {
514
- return reg->iface.get_device_count(reg);
515
- }
516
-
517
- lm_ggml_backend_dev_t lm_ggml_backend_reg_dev_get(lm_ggml_backend_reg_t reg, size_t index) {
518
- return reg->iface.get_device(reg, index);
519
- }
520
-
521
- void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
522
- if (!reg->iface.get_proc_address) {
523
- return NULL;
524
- }
525
- return reg->iface.get_proc_address(reg, name);
526
- }
527
-
528
- // Backend registry
529
-
530
- #ifdef LM_GGML_USE_CUDA
531
- #include "ggml-cuda.h"
532
- #endif
533
-
534
- #ifdef LM_GGML_USE_METAL
535
- #include "ggml-metal.h"
536
- #endif
537
-
538
- #ifdef LM_GGML_USE_SYCL
539
- #include "ggml-sycl.h"
540
- #endif
541
-
542
- #ifdef LM_GGML_USE_VULKAN
543
- #include "ggml-vulkan.h"
544
- #endif
545
-
546
- #ifdef LM_GGML_USE_BLAS
547
- #include "ggml-blas.h"
548
- #endif
549
-
550
- #ifdef LM_GGML_USE_RPC
551
- #include "ggml-rpc.h"
552
- #endif
553
-
554
- #ifndef __AMX_INT8__
555
- #undef LM_GGML_USE_AMX
556
- #endif
557
-
558
- #ifdef LM_GGML_USE_AMX
559
- # include "ggml-amx.h"
560
- #endif
561
-
562
- #ifdef LM_GGML_USE_CANN
563
- #include "ggml-cann.h"
564
- #endif
565
-
566
- #ifdef LM_GGML_USE_KOMPUTE
567
- #include "ggml-kompute.h"
568
- #endif
569
-
570
- #include "ggml-cpu.h"
571
-
572
- struct lm_ggml_backend_registry {
573
- std::vector<lm_ggml_backend_reg_t> backends;
574
- std::vector<lm_ggml_backend_dev_t> devices;
575
-
576
- lm_ggml_backend_registry() {
577
- #ifdef LM_GGML_USE_CUDA
578
- register_backend(lm_ggml_backend_cuda_reg());
579
- #endif
580
- #ifdef LM_GGML_USE_METAL
581
- register_backend(lm_ggml_backend_metal_reg());
582
- #endif
583
- #ifdef LM_GGML_USE_SYCL
584
- register_backend(lm_ggml_backend_sycl_reg());
585
- #endif
586
- #ifdef LM_GGML_USE_VULKAN
587
- register_backend(lm_ggml_backend_vk_reg());
588
- #endif
589
- #ifdef LM_GGML_USE_CANN
590
- register_backend(lm_ggml_backend_cann_reg());
591
- #endif
592
- #ifdef LM_GGML_USE_BLAS
593
- register_backend(lm_ggml_backend_blas_reg());
594
- #endif
595
- #ifdef LM_GGML_USE_RPC
596
- register_backend(lm_ggml_backend_rpc_reg());
597
- #endif
598
- #ifdef LM_GGML_USE_AMX
599
- register_backend(lm_ggml_backend_amx_reg());
600
- #endif
601
- #ifdef LM_GGML_USE_KOMPUTE
602
- register_backend(lm_ggml_backend_kompute_reg());
603
- #endif
604
-
605
- register_backend(lm_ggml_backend_cpu_reg());
606
- }
607
-
608
- void register_backend(lm_ggml_backend_reg_t reg) {
609
- #ifndef NDEBUG
610
- LM_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
611
- __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
612
- #endif
613
- backends.push_back(reg);
614
- for (size_t i = 0; i < lm_ggml_backend_reg_dev_count(reg); i++) {
615
- register_device(lm_ggml_backend_reg_dev_get(reg, i));
616
- }
617
- }
618
-
619
- void register_device(lm_ggml_backend_dev_t device) {
620
- #ifndef NDEBUG
621
- LM_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
622
- #endif
623
- devices.push_back(device);
624
- }
625
- };
626
-
627
- static lm_ggml_backend_registry & get_reg() {
628
- static lm_ggml_backend_registry reg;
629
- return reg;
630
- }
631
-
632
- // Internal API
633
- void lm_ggml_backend_register(lm_ggml_backend_reg_t reg) {
634
- get_reg().register_backend(reg);
635
- }
636
-
637
- void lm_ggml_backend_device_register(lm_ggml_backend_dev_t device) {
638
- get_reg().register_device(device);
639
- }
640
-
641
- // Backend (reg) enumeration
642
- size_t lm_ggml_backend_reg_count() {
643
- return get_reg().backends.size();
644
- }
645
-
646
- lm_ggml_backend_reg_t lm_ggml_backend_reg_get(size_t index) {
647
- LM_GGML_ASSERT(index < lm_ggml_backend_reg_count());
648
- return get_reg().backends[index];
649
- }
650
-
651
- lm_ggml_backend_reg_t lm_ggml_backend_reg_by_name(const char * name) {
652
- for (size_t i = 0; i < lm_ggml_backend_reg_count(); i++) {
653
- lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_get(i);
654
- if (strcmp(lm_ggml_backend_reg_name(reg), name) == 0) {
655
- return reg;
656
- }
657
- }
658
- return NULL;
659
- }
660
-
661
- // Device enumeration
662
- size_t lm_ggml_backend_dev_count() {
663
- return get_reg().devices.size();
664
- }
665
-
666
- lm_ggml_backend_dev_t lm_ggml_backend_dev_get(size_t index) {
667
- LM_GGML_ASSERT(index < lm_ggml_backend_dev_count());
668
- return get_reg().devices[index];
669
- }
670
-
671
- lm_ggml_backend_dev_t lm_ggml_backend_dev_by_name(const char * name) {
672
- for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
673
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
674
- if (strcmp(lm_ggml_backend_dev_name(dev), name) == 0) {
675
- return dev;
676
- }
677
- }
678
- return NULL;
679
- }
680
-
681
- lm_ggml_backend_dev_t lm_ggml_backend_dev_by_type(enum lm_ggml_backend_dev_type type) {
682
- for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
683
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
684
- if (lm_ggml_backend_dev_type(dev) == type) {
685
- return dev;
686
- }
687
- }
688
- return NULL;
689
- }
690
-
691
- // Convenience functions
692
- lm_ggml_backend_t lm_ggml_backend_init_by_name(const char * name, const char * params) {
693
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_name(name);
694
- if (!dev) {
695
- return NULL;
696
- }
697
- return lm_ggml_backend_dev_init(dev, params);
698
- }
699
-
700
- lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type type, const char * params) {
701
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(type);
702
- if (!dev) {
703
- return NULL;
704
- }
705
- return lm_ggml_backend_dev_init(dev, params);
706
- }
707
-
708
- lm_ggml_backend_t lm_ggml_backend_init_best(void) {
709
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU);
710
- if (!dev) {
711
- dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
712
- }
713
- if (!dev) {
714
- return NULL;
715
- }
716
- return lm_ggml_backend_dev_init(dev, NULL);
717
- }
718
-
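Editor's note: the enumeration and convenience helpers above are the usual application entry point: list the devices registered at startup, then initialize whichever backend fits (init_best prefers a GPU device and falls back to CPU). A self-contained sketch, illustrative only and not part of the package:

    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        // walk every device registered at startup (CPU is always the last / lowest priority)
        for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
            lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
            size_t free_mem = 0, total_mem = 0;
            lm_ggml_backend_dev_memory(dev, &free_mem, &total_mem);
            printf("device %zu: %s (%s), %zu/%zu bytes free\n", i,
                   lm_ggml_backend_dev_name(dev), lm_ggml_backend_dev_description(dev),
                   free_mem, total_mem);
        }

        // prefer a GPU device, fall back to CPU
        lm_ggml_backend_t backend = lm_ggml_backend_init_best();
        if (backend == NULL) {
            return 1;
        }
        printf("using backend: %s\n", lm_ggml_backend_name(backend));
        lm_ggml_backend_free(backend);
        return 0;
    }
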
719
- // multi-buffer buffer
720
-
721
- struct lm_ggml_backend_multi_buffer_context {
722
- lm_ggml_backend_buffer_t * buffers;
723
- size_t n_buffers;
724
- };
725
-
726
- static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
727
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
728
- for (size_t i = 0; i < ctx->n_buffers; i++) {
729
- lm_ggml_backend_buffer_free(ctx->buffers[i]);
730
- }
731
-
732
- free(ctx->buffers);
733
- free(ctx);
734
- }
735
-
736
- static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
737
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
738
- for (size_t i = 0; i < ctx->n_buffers; i++) {
739
- lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
740
- }
741
- }
742
-
743
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
744
- /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
745
- /* .get_base = */ NULL,
746
- /* .init_tensor = */ NULL,
747
- /* .memset_tensor = */ NULL,
748
- /* .set_tensor = */ NULL,
749
- /* .get_tensor = */ NULL,
750
- /* .cpy_tensor = */ NULL,
751
- /* .clear = */ lm_ggml_backend_multi_buffer_clear,
752
- /* .reset = */ NULL,
753
- };
754
-
755
- lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
756
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
757
- ctx->n_buffers = n_buffers;
758
- ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
759
-
760
- LM_GGML_ASSERT(ctx->buffers != NULL);
761
-
762
- size_t total_size = 0;
763
- for (size_t i = 0; i < n_buffers; i++) {
764
- ctx->buffers[i] = buffers[i];
765
- total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
766
- }
767
-
768
- return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
769
- }
770
-
771
- bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
772
- return buffer->iface.free_buffer == lm_ggml_backend_multi_buffer_free_buffer;
773
- }
774
-
775
- void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
776
- LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
777
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
778
- for (size_t i = 0; i < ctx->n_buffers; i++) {
779
- lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
780
- }
781
- }
782
-
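Editor's note: the multi-buffer wrapper above groups several buffers so they can be freed, cleared and usage-tagged as one unit. A sketch of how it might be used (illustrative only; `parts` is assumed to hold n valid buffers, e.g. model weights split across several allocations, and the name demo_combine_buffers is hypothetical):

    // assumes the lm_ggml_backend_* declarations from this file's headers are in scope
    static lm_ggml_backend_buffer_t demo_combine_buffers(lm_ggml_backend_buffer_t * parts, size_t n) {
        lm_ggml_backend_buffer_t combined = lm_ggml_backend_multi_buffer_alloc_buffer(parts, n);

        // tagging the wrapper propagates the usage flag to every child buffer
        lm_ggml_backend_buffer_set_usage(combined, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        // the wrapper reports the summed size of all children; freeing it frees all of them
        LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(combined));
        return combined;
    }
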
783
- // creates a copy of the tensor with the same memory layout
784
- static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
785
- struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
786
- for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
787
- dup->nb[i] = tensor->nb[i];
788
- }
789
- return dup;
790
- }
791
-
792
- static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
793
- return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
794
- }
795
-
796
- // scheduler
797
-
798
- #ifndef LM_GGML_SCHED_MAX_BACKENDS
799
- #define LM_GGML_SCHED_MAX_BACKENDS 16
800
- #endif
801
-
802
- #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
803
- #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
804
- #endif
805
-
806
- #ifndef LM_GGML_SCHED_MAX_COPIES
807
- #define LM_GGML_SCHED_MAX_COPIES 4
808
- #endif
809
-
810
- struct lm_ggml_backend_sched_split {
811
- int backend_id;
812
- int i_start;
813
- int i_end;
814
- struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
815
- int n_inputs;
816
- // graph view of this split
817
- struct lm_ggml_cgraph graph;
818
- };
819
-
820
- struct lm_ggml_backend_sched {
821
- bool is_reset; // true if the scheduler has been reset since the last graph split
822
- bool is_alloc;
823
-
824
- int n_backends;
825
-
826
- lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
827
- lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
828
- lm_ggml_gallocr_t galloc;
829
-
830
- // hash map of the nodes in the graph
831
- struct lm_ggml_hash_set hash_set;
832
- int * hv_tensor_backend_ids; // [hash_set.size]
833
- struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
834
-
835
- int * node_backend_ids; // [graph_size]
836
- int * leaf_backend_ids; // [graph_size]
837
-
838
- int * prev_node_backend_ids; // [graph_size]
839
- int * prev_leaf_backend_ids; // [graph_size]
840
-
841
- // copy of the graph with modified inputs
842
- struct lm_ggml_cgraph graph;
843
-
844
- // graph splits
845
- struct lm_ggml_backend_sched_split * splits;
846
- int n_splits;
847
- int splits_capacity;
848
-
849
- // pipeline parallelism support
850
- int n_copies;
851
- int cur_copy;
852
- lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
853
- struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
854
- int n_graph_inputs;
855
-
856
- struct lm_ggml_context * ctx;
857
-
858
- lm_ggml_backend_sched_eval_callback callback_eval;
859
- void * callback_eval_user_data;
860
-
861
- char * context_buffer;
862
- size_t context_buffer_size;
863
-
864
- int debug;
865
- };
866
-
867
- #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
868
- #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
869
- #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
870
- #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
871
-
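Editor's note: tensor_id_copy above flattens a three-dimensional table indexed by [hash id][backend][copy] into the single hv_tensor_copies array. Written out as a plain function, for illustration only, the addressing is:

    // equivalent to: id * n_backends * n_copies + backend_id * n_copies + copy_id
    static inline size_t tensor_copy_index(size_t id, int backend_id, int copy_id,
                                           int n_backends, int n_copies) {
        return (id * n_backends + backend_id) * (size_t) n_copies + copy_id;
    }
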
872
- // returns the priority of the backend, lower id is higher priority
873
- static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
874
- for (int i = 0; i < sched->n_backends; i++) {
875
- if (sched->backends[i] == backend) {
876
- return i;
877
- }
878
- }
879
- return -1;
880
- }
881
-
882
- static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
883
- lm_ggml_backend_buffer_t buffer = tensor->buffer;
884
- if (buffer == NULL) {
885
- return -1;
886
- }
887
-
888
- // find highest prio backend that supports the buffer type and the op
889
- for (int i = 0; i < sched->n_backends; i++) {
890
- if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
891
- lm_ggml_backend_supports_op(sched->backends[i], op)) {
892
- return i;
893
- }
894
- }
895
-
896
- #ifndef NDEBUG
897
- LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
898
- __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
899
- #endif
900
-
901
- return -1;
902
- }
903
-
904
- #if 0
905
- #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
906
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
907
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
908
- #define GET_CAUSE(node) causes[hash_id(node)]
909
- #else
910
- #define SET_CAUSE(node, ...)
911
- #define GET_CAUSE(node) ""
912
- #endif
913
-
914
- // returns the backend that should be used for the node based on the current locations
915
- static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
916
- // TODO: use supports_op to check if the backend supports the op
917
-
918
- // assign pre-allocated nodes to their backend
919
- int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
920
- if (cur_backend_id != -1) {
921
- SET_CAUSE(tensor, "1.dst");
922
- return cur_backend_id;
923
- }
924
-
925
- // view_src
926
- if (tensor->view_src != NULL) {
927
- cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
928
- if (cur_backend_id != -1) {
929
- SET_CAUSE(tensor, "1.vsrc");
930
- return cur_backend_id;
931
- }
932
- }
933
-
934
- if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
935
- // since the tensor is pre-allocated, it cannot be moved to another backend
936
- LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
937
- }
938
-
939
- // graph input
940
- if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
941
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
942
- SET_CAUSE(tensor, "1.inp");
943
- return cur_backend_id;
944
- }
945
-
946
- // operations with weights are preferably run on the same backend as the weights
947
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
948
- const struct lm_ggml_tensor * src = tensor->src[i];
949
- if (src == NULL) {
950
- continue;
951
- }
952
- // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
953
- // not an ideal solution
954
- if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
955
- int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
956
- // check if a backend with higher prio wants to offload the op
957
- if (src_backend_id == sched->n_backends - 1) {
958
- for (int b = 0; b < src_backend_id; b++) {
959
- if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
960
- SET_CAUSE(tensor, "1.off");
961
- return b;
962
- }
963
- }
964
- }
965
- SET_CAUSE(tensor, "1.wgt%d", i);
966
- return src_backend_id;
967
- }
968
- }
969
-
970
- return -1;
971
- }
972
-
973
- static char * fmt_size(size_t size) {
974
- static char buffer[128];
975
- if (size >= 1024*1024) {
976
- snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
977
- } else {
978
- snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
979
- }
980
- return buffer;
981
- }
982
-
983
- static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
984
- int cur_split = 0;
985
- for (int i = 0; i < graph->n_nodes; i++) {
986
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
987
- lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
988
- LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
989
- sched->splits[cur_split].n_inputs);
990
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
991
- LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
992
- fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
993
- }
994
- LM_GGML_LOG_DEBUG("\n");
995
- cur_split++;
996
- }
997
- struct lm_ggml_tensor * node = graph->nodes[i];
998
- if (lm_ggml_is_view_op(node->op)) {
999
- continue;
1000
- }
1001
- if (sched->debug > 1) {
1002
- lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1003
- LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1004
- fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1005
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1006
- struct lm_ggml_tensor * src = node->src[j];
1007
- if (src == NULL) {
1008
- continue;
1009
- }
1010
- lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1011
- LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1012
- fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1013
- }
1014
- LM_GGML_LOG_DEBUG("\n");
1015
- }
1016
- }
1017
- }
1018
-
1019
- static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1020
- lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1021
- lm_ggml_backend_buffer_type_t buft = NULL;
1022
-
1023
- if (buf) {
1024
- // the tensor is already allocated
1025
- buft = buf->buft;
1026
- } else {
1027
- // see if the tensor already has a backend assigned, and use the buffer type of that backend
1028
- int tensor_backend_id = tensor_backend_id(t);
1029
- if (tensor_backend_id == -1 && t->view_src) {
1030
- tensor_backend_id = tensor_backend_id(t->view_src);
1031
- }
1032
- if (tensor_backend_id != -1) {
1033
- buft = sched->bufts[tensor_backend_id];
1034
- }
1035
- }
1036
-
1037
- return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
1038
- }
1039
-
1040
- static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1041
- if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1042
- *node_backend_id = cur_backend_id;
1043
- SET_CAUSE(node, "2.sup");
1044
- }
1045
- }
1046
-
1047
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1048
- static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1049
- // reset splits
1050
- sched->n_splits = 0;
1051
- sched->n_graph_inputs = 0;
1052
- sched->is_reset = false;
1053
-
1054
- struct lm_ggml_init_params params = {
1055
- /* .mem_size = */ sched->context_buffer_size,
1056
- /* .mem_buffer = */ sched->context_buffer,
1057
- /* .no_alloc = */ true
1058
- };
1059
-
1060
- lm_ggml_free(sched->ctx);
1061
-
1062
- sched->ctx = lm_ggml_init(params);
1063
- if (sched->ctx == NULL) {
1064
- LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
1065
- }
1066
-
1067
- // pass 1: assign backends to ops with pre-allocated inputs
1068
- for (int i = 0; i < graph->n_leafs; i++) {
1069
- struct lm_ggml_tensor * leaf = graph->leafs[i];
1070
- int * leaf_backend_id = &tensor_backend_id(leaf);
1071
- // do not overwrite user assignments
1072
- if (*leaf_backend_id == -1) {
1073
- *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1074
- }
1075
- }
1076
-
1077
- for (int i = 0; i < graph->n_nodes; i++) {
1078
- struct lm_ggml_tensor * node = graph->nodes[i];
1079
- int * node_backend_id = &tensor_backend_id(node);
1080
- // do not overwrite user assignments
1081
- if (*node_backend_id == -1) {
1082
- *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1083
-
1084
- #if 0
1085
- // src
1086
- if (node->op == LM_GGML_OP_NONE) {
1087
- continue;
1088
- }
1089
-
1090
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1091
- struct lm_ggml_tensor * src = node->src[j];
1092
- if (src == NULL) {
1093
- continue;
1094
- }
1095
- int * src_backend_id = &tensor_backend_id(src);
1096
- if (*src_backend_id == -1) {
1097
- *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1098
- }
1099
- }
1100
- #endif
1101
- }
1102
- }
1103
-
1104
- // pass 2: expand current backend assignments
1105
- // assign the same backend to adjacent nodes
1106
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1107
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1108
- // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1109
- // expand gpu down
1110
- {
1111
- int cur_backend_id = -1;
1112
- for (int i = 0; i < graph->n_nodes; i++) {
1113
- struct lm_ggml_tensor * node = graph->nodes[i];
1114
- if (lm_ggml_is_view_op(node->op)) {
1115
- continue;
1116
- }
1117
- int * node_backend_id = &tensor_backend_id(node);
1118
- if (*node_backend_id != -1) {
1119
- if (*node_backend_id == sched->n_backends - 1) {
1120
- // skip cpu (lowest prio backend)
1121
- cur_backend_id = -1;
1122
- } else {
1123
- cur_backend_id = *node_backend_id;
1124
- }
1125
- } else if (cur_backend_id != -1) {
1126
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1127
- }
1128
- }
1129
- }
1130
- // expand gpu up
1131
- {
1132
- int cur_backend_id = -1;
1133
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1134
- struct lm_ggml_tensor * node = graph->nodes[i];
1135
- if (lm_ggml_is_view_op(node->op)) {
1136
- continue;
1137
- }
1138
- int * node_backend_id = &tensor_backend_id(node);
1139
- if (*node_backend_id != -1) {
1140
- if (*node_backend_id == sched->n_backends - 1) {
1141
- // skip cpu (lowest prio backend)
1142
- cur_backend_id = -1;
1143
- } else {
1144
- cur_backend_id = *node_backend_id;
1145
- }
1146
- } else if (cur_backend_id != -1) {
1147
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1148
- }
1149
- }
1150
- }
1151
- // expand rest down
1152
- {
1153
- int cur_backend_id = -1;
1154
- for (int i = 0; i < graph->n_nodes; i++) {
1155
- struct lm_ggml_tensor * node = graph->nodes[i];
1156
- if (lm_ggml_is_view_op(node->op)) {
1157
- continue;
1158
- }
1159
- int * node_backend_id = &tensor_backend_id(node);
1160
- if (*node_backend_id != -1) {
1161
- cur_backend_id = *node_backend_id;
1162
- } else if (cur_backend_id != -1) {
1163
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1164
- }
1165
- }
1166
- }
1167
- // expand rest up
1168
- {
1169
- int cur_backend_id = -1;
1170
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1171
- struct lm_ggml_tensor * node = graph->nodes[i];
1172
- if (lm_ggml_is_view_op(node->op)) {
1173
- continue;
1174
- }
1175
- int * node_backend_id = &tensor_backend_id(node);
1176
- if (*node_backend_id != -1) {
1177
- cur_backend_id = *node_backend_id;
1178
- } else if (cur_backend_id != -1) {
1179
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1180
- }
1181
- }
1182
- }
1183
-
1184
- // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1185
- // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1186
- // however, we also need to verify that the sources are in compatible buffer types
1187
- // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1188
- // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1189
- // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1190
- // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1191
- // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1192
- for (int i = 0; i < graph->n_nodes; i++) {
1193
- struct lm_ggml_tensor * node = graph->nodes[i];
1194
- if (lm_ggml_is_view_op(node->op)) {
1195
- continue;
1196
- }
1197
- int * node_backend_id = &tensor_backend_id(node);
1198
- if (*node_backend_id == -1) {
1199
- // unassigned node: find the backend with the most supported inputs
1200
- int n_supported_best = -1;
1201
- for (int b = 0; b < sched->n_backends; b++) {
1202
- if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1203
- int n_supported = 0;
1204
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1205
- struct lm_ggml_tensor * src = node->src[j];
1206
- if (src == NULL) {
1207
- continue;
1208
- }
1209
- if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1210
- n_supported++;
1211
- }
1212
- }
1213
- if (n_supported > n_supported_best) {
1214
- n_supported_best = n_supported;
1215
- *node_backend_id = b;
1216
- SET_CAUSE(node, "3.best");
1217
- }
1218
- }
1219
- }
1220
- } else {
1221
- // assigned node: upgrade to higher prio backend if possible
1222
- for (int b = 0; b < *node_backend_id; b++) {
1223
- if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1224
- bool supported = true;
1225
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1226
- struct lm_ggml_tensor * src = node->src[j];
1227
- if (src == NULL) {
1228
- continue;
1229
- }
1230
- if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1231
- supported = false;
1232
- break;
1233
- }
1234
- }
1235
- if (supported) {
1236
- *node_backend_id = b;
1237
- SET_CAUSE(node, "3.upg");
1238
- break;
1239
- }
1240
- }
1241
- }
1242
- }
1243
- }
1244
-
1245
- // pass 4: assign backends to remaining src from dst and view_src
1246
- for (int i = 0; i < graph->n_nodes; i++) {
1247
- struct lm_ggml_tensor * node = graph->nodes[i];
1248
- int * cur_backend_id = &tensor_backend_id(node);
1249
- if (node->view_src != NULL && *cur_backend_id == -1) {
1250
- *cur_backend_id = tensor_backend_id(node->view_src);
1251
- SET_CAUSE(node, "4.vsrc");
1252
- }
1253
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1254
- struct lm_ggml_tensor * src = node->src[j];
1255
- if (src == NULL) {
1256
- continue;
1257
- }
1258
- int * src_backend_id = &tensor_backend_id(src);
1259
- if (*src_backend_id == -1) {
1260
- if (src->view_src != NULL) {
1261
- // views are always on the same backend as the source
1262
- *src_backend_id = tensor_backend_id(src->view_src);
1263
- SET_CAUSE(src, "4.vsrc");
1264
- } else {
1265
- *src_backend_id = *cur_backend_id;
1266
- SET_CAUSE(src, "4.cur");
1267
- }
1268
- }
1269
- }
1270
- }
1271
-
1272
- // pass 5: split graph, find tensors that need to be copied
1273
- {
1274
- int i_split = 0;
1275
- struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1276
- // find the backend of the first split, skipping view ops
1277
- int i = 0;
1278
- for (; i < graph->n_nodes; i++) {
1279
- struct lm_ggml_tensor * node = graph->nodes[i];
1280
- if (!lm_ggml_is_view_op(node->op)) {
1281
- split->backend_id = tensor_backend_id(node);
1282
- break;
1283
- }
1284
- }
1285
- split->i_start = 0;
1286
- split->n_inputs = 0;
1287
- int cur_backend_id = split->backend_id;
1288
- for (; i < graph->n_nodes; i++) {
1289
- struct lm_ggml_tensor * node = graph->nodes[i];
1290
-
1291
- if (lm_ggml_is_view_op(node->op)) {
1292
- continue;
1293
- }
1294
-
1295
- const int node_backend_id = tensor_backend_id(node);
1296
-
1297
- assert(node_backend_id != -1); // all nodes should be assigned by now
1298
-
1299
- // check if we should start a new split based on the sources of the current node
1300
- bool need_new_split = false;
1301
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1302
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1303
- struct lm_ggml_tensor * src = node->src[j];
1304
- if (src == NULL) {
1305
- continue;
1306
- }
1307
- // check if a weight is on a different and incompatible backend
1308
- // by starting a new split, the memory of the previously offloaded weights can be reused
1309
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1310
- int src_backend_id = tensor_backend_id(src);
1311
- if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1312
- need_new_split = true;
1313
- break;
1314
- }
1315
- }
1316
- // check if the split has too many inputs
1317
- // FIXME: count the number of inputs instead of only checking when full
1318
- if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1319
- const size_t id = hash_id(src);
1320
- int src_backend_id = sched->hv_tensor_backend_ids[id];
1321
- bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1322
- if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1323
- need_new_split = true;
1324
- break;
1325
- }
1326
- }
1327
- }
1328
- }
1329
-
1330
- if (node_backend_id != cur_backend_id || need_new_split) {
1331
- split->i_end = i;
1332
- i_split++;
1333
- if (i_split >= sched->splits_capacity) {
1334
- sched->splits_capacity *= 2;
1335
- sched->splits = (lm_ggml_backend_sched_split *)
1336
- realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1337
- LM_GGML_ASSERT(sched->splits != NULL);
1338
- }
1339
- split = &sched->splits[i_split];
1340
- split->backend_id = node_backend_id;
1341
- split->i_start = i;
1342
- split->n_inputs = 0;
1343
- cur_backend_id = node_backend_id;
1344
- }
1345
-
1346
- // find inputs that are not on the same backend
1347
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1348
- struct lm_ggml_tensor * src = node->src[j];
1349
- if (src == NULL) {
1350
- continue;
1351
- }
1352
-
1353
- size_t src_id = hash_id(src);
1354
- const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1355
- assert(src_backend_id != -1); // all inputs should be assigned by now
1356
-
1357
- if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1358
- if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1359
- lm_ggml_backend_t backend = sched->backends[src_backend_id];
1360
- for (int c = 0; c < sched->n_copies; c++) {
1361
- struct lm_ggml_tensor * tensor_copy;
1362
- if (c == sched->cur_copy) {
1363
- tensor_copy = src; // use the original tensor as the current copy
1364
- } else {
1365
- tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1366
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1367
- }
1368
- if (sched->n_copies > 1) {
1369
- lm_ggml_set_input(tensor_copy);
1370
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1371
- }
1372
- tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1373
- SET_CAUSE(tensor_copy, "4.cpy");
1374
- }
1375
- int n_graph_inputs = sched->n_graph_inputs++;
1376
- LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1377
- sched->graph_inputs[n_graph_inputs] = src;
1378
- }
1379
- }
1380
-
1381
- if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1382
- // create a copy of the input in the split's backend
1383
- if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1384
- lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1385
- for (int c = 0; c < sched->n_copies; c++) {
1386
- struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1387
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1388
- if (sched->n_copies > 1) {
1389
- lm_ggml_set_input(tensor_copy);
1390
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1391
- }
1392
- tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1393
- SET_CAUSE(tensor_copy, "4.cpy");
1394
- }
1395
- int n_inputs = split->n_inputs++;
1396
- LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1397
- split->inputs[n_inputs] = src;
1398
- }
1399
- node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1400
- }
1401
- }
1402
- }
1403
- split->i_end = graph->n_nodes;
1404
- sched->n_splits = i_split + 1;
1405
- }
1406
-
1407
- if (sched->debug) {
1408
- lm_ggml_backend_sched_print_assignments(sched, graph);
1409
- }
1410
-
1411
- // swap node_backend_ids and leaf _backend_ids with prevs
1412
- {
1413
- int * tmp = sched->node_backend_ids;
1414
- sched->node_backend_ids = sched->prev_node_backend_ids;
1415
- sched->prev_node_backend_ids = tmp;
1416
-
1417
- tmp = sched->leaf_backend_ids;
1418
- sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1419
- sched->prev_leaf_backend_ids = tmp;
1420
- }
1421
-
1422
- int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1423
- if (sched->graph.size < graph_size) {
1424
- sched->graph.size = graph_size;
1425
- sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1426
- sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1427
- LM_GGML_ASSERT(sched->graph.nodes != NULL);
1428
- LM_GGML_ASSERT(sched->graph.leafs != NULL);
1429
- }
1430
- sched->graph.n_nodes = 0;
1431
- sched->graph.n_leafs = 0;
1432
-
1433
- struct lm_ggml_cgraph * graph_copy = &sched->graph;
1434
-
1435
- for (int i = 0; i < sched->n_splits; i++) {
1436
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1437
- split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1438
-
1439
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1440
- for (int j = 0; j < split->n_inputs; j++) {
1441
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1442
-
1443
- struct lm_ggml_tensor * input = split->inputs[j];
1444
- const size_t input_id = hash_id(input);
1445
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1446
-
1447
- // add a dependency to the input source so that it is not freed before the copy is done
1448
- struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
1449
- input_dep->src[0] = input;
1450
- sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1451
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1452
-
1453
- // add a dependency to the input copy so that it is allocated at the start of the split
1454
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1455
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1456
- }
1457
-
1458
- for (int j = split->i_start; j < split->i_end; j++) {
1459
- assert(graph_copy->size > graph_copy->n_nodes);
1460
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1461
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1462
- }
1463
- }
1464
-
1465
- if (sched->n_copies > 1) {
1466
- // add input copies as leafs so that they are allocated first
1467
- for (int i = 0; i < sched->n_graph_inputs; i++) {
1468
- struct lm_ggml_tensor * input = sched->graph_inputs[i];
1469
- size_t id = hash_id(input);
1470
- int backend_id = tensor_backend_id(input);
1471
- for (int c = 0; c < sched->n_copies; c++) {
1472
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1473
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1474
- assert(graph_copy->size > graph_copy->n_leafs);
1475
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1476
- }
1477
- }
1478
-
1479
- for (int i = 0; i < sched->n_splits; i++) {
1480
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1481
- int backend_id = split->backend_id;
1482
- for (int j = 0; j < split->n_inputs; j++) {
1483
- struct lm_ggml_tensor * input = split->inputs[j];
1484
- size_t id = hash_id(input);
1485
- for (int c = 0; c < sched->n_copies; c++) {
1486
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1487
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1488
- assert(graph_copy->size > graph_copy->n_leafs);
1489
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1490
- }
1491
- }
1492
- }
1493
- }
1494
-
1495
- // add leafs from the original graph
1496
- for (int i = 0; i < graph->n_leafs; i++) {
1497
- struct lm_ggml_tensor * leaf = graph->leafs[i];
1498
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1499
- assert(graph_copy->size > graph_copy->n_leafs);
1500
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1501
- }
1502
- }
1503
-
1504
- static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
1505
- bool backend_ids_changed = false;
1506
- for (int i = 0; i < sched->graph.n_nodes; i++) {
1507
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1508
- sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1509
- backend_ids_changed = true;
1510
- break;
1511
- }
1512
- }
1513
- if (!backend_ids_changed) {
1514
- for (int i = 0; i < sched->graph.n_leafs; i++) {
1515
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1516
- sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1517
- backend_ids_changed = true;
1518
- break;
1519
- }
1520
- }
1521
- }
1522
-
1523
- // allocate graph
1524
- if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1525
- // the re-allocation may cause the split inputs to be moved to a different address
1526
- lm_ggml_backend_sched_synchronize(sched);
1527
- #ifndef NDEBUG
1528
- LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1529
- #endif
1530
- lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1531
- if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1532
- LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
1533
- return false;
1534
- }
1535
- }
1536
-
1537
- return true;
1538
- }
1539
-
1540
- static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
1541
- struct lm_ggml_backend_sched_split * splits = sched->splits;
1542
-
1543
- for (int i = 0; i < sched->n_splits; i++) {
1544
- struct lm_ggml_backend_sched_split * split = &splits[i];
1545
- int split_backend_id = split->backend_id;
1546
- lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
1547
-
1548
- // copy the input tensors to the split backend
1549
- for (int j = 0; j < split->n_inputs; j++) {
1550
- lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1551
- struct lm_ggml_tensor * input = split->inputs[j];
1552
- struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1553
-
1554
- if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1555
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1556
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1557
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1558
- } else {
1559
- lm_ggml_backend_synchronize(split_backend);
1560
- }
1561
- lm_ggml_backend_tensor_copy(input, input_cpy);
1562
- } else {
1563
- // wait for the split backend to finish using the input before overwriting it
1564
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1565
- lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1566
- } else {
1567
- lm_ggml_backend_synchronize(split_backend);
1568
- }
1569
- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1570
- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1571
- if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1572
- lm_ggml_backend_synchronize(input_backend);
1573
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1574
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1575
- } else {
1576
- lm_ggml_backend_synchronize(split_backend);
1577
- }
1578
- lm_ggml_backend_tensor_copy(input, input_cpy);
1579
- }
1580
- }
1581
- }
1582
-
1583
- if (!sched->callback_eval) {
1584
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
1585
- if (ec != LM_GGML_STATUS_SUCCESS) {
1586
- return ec;
1587
- }
1588
- } else {
1589
- // similar to lm_ggml_backend_compare_graph_backend
1590
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1591
- struct lm_ggml_tensor * t = split->graph.nodes[j0];
1592
-
1593
- // check if the user needs data from this node
1594
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1595
-
1596
- int j1 = j0;
1597
-
1598
- // determine the range [j0, j1] of nodes that can be computed together
1599
- while (!need && j1 < split->graph.n_nodes - 1) {
1600
- t = split->graph.nodes[++j1];
1601
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1602
- }
1603
-
1604
- struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
1605
-
1606
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
1607
- if (ec != LM_GGML_STATUS_SUCCESS) {
1608
- return ec;
1609
- }
1610
-
1611
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1612
- lm_ggml_backend_synchronize(split_backend);
1613
-
1614
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1615
- break;
1616
- }
1617
-
1618
- j0 = j1;
1619
- }
1620
- }
1621
-
1622
- // record the event of this copy
1623
- if (split->n_inputs > 0) {
1624
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1625
- lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
1626
- }
1627
- }
1628
- }
1629
-
1630
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1631
-
1632
- return LM_GGML_STATUS_SUCCESS;
1633
- }
1634
-
1635
- lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1636
- lm_ggml_backend_t * backends,
1637
- lm_ggml_backend_buffer_type_t * bufts,
1638
- int n_backends,
1639
- size_t graph_size,
1640
- bool parallel) {
1641
- LM_GGML_ASSERT(n_backends > 0);
1642
- LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
1643
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1644
-
1645
- struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
1646
-
1647
- const char * LM_GGML_SCHED_DEBUG = getenv("LM_GGML_SCHED_DEBUG");
1648
- sched->debug = LM_GGML_SCHED_DEBUG ? atoi(LM_GGML_SCHED_DEBUG) : 0;
1649
- sched->n_backends = n_backends;
1650
- sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
1651
-
1652
- // initialize hash table
1653
- // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1654
- sched->hash_set = lm_ggml_hash_set_new(graph_size);
1655
- sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1656
- sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1657
-
1658
- const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
1659
- const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1660
- sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1661
- sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1662
- sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1663
- sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1664
-
1665
- sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1666
- sched->context_buffer = (char *) malloc(sched->context_buffer_size);
1667
-
1668
- const int initial_splits_capacity = 16;
1669
- sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1670
- sched->splits_capacity = initial_splits_capacity;
1671
-
1672
- for (int b = 0; b < n_backends; b++) {
1673
- sched->backends[b] = backends[b];
1674
- sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
1675
- LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1676
-
1677
- if (sched->n_copies > 1) {
1678
- for (int c = 0; c < sched->n_copies; c++) {
1679
- sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
1680
- }
1681
- }
1682
- }
1683
-
1684
- sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
1685
-
1686
- lm_ggml_backend_sched_reset(sched);
1687
-
1688
- return sched;
1689
- }
1690
-
1691
- void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
1692
- if (sched == NULL) {
1693
- return;
1694
- }
1695
- for (int b = 0; b < sched->n_backends; b++) {
1696
- for (int c = 0; c < sched->n_copies; c++) {
1697
- lm_ggml_backend_event_free(sched->events[b][c]);
1698
- }
1699
- }
1700
- lm_ggml_gallocr_free(sched->galloc);
1701
- lm_ggml_free(sched->ctx);
1702
- lm_ggml_hash_set_free(&sched->hash_set);
1703
- free(sched->splits);
1704
- free(sched->hv_tensor_backend_ids);
1705
- free(sched->hv_tensor_copies);
1706
- free(sched->node_backend_ids);
1707
- free(sched->leaf_backend_ids);
1708
- free(sched->prev_node_backend_ids);
1709
- free(sched->prev_leaf_backend_ids);
1710
- free(sched->context_buffer);
1711
- free(sched->graph.nodes);
1712
- free(sched->graph.leafs);
1713
- free(sched);
1714
- }
1715
-
1716
- void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
1717
- // reset state for the next run
1718
- if (!sched->is_reset) {
1719
- lm_ggml_hash_set_reset(&sched->hash_set);
1720
- memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1721
- memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1722
- sched->is_reset = true;
1723
- }
1724
- sched->is_alloc = false;
1725
- }
1726
-
1727
- bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
1728
- LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1729
-
1730
- lm_ggml_backend_sched_split_graph(sched, measure_graph);
1731
-
1732
- if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1733
- return false;
1734
- }
1735
-
1736
- lm_ggml_backend_sched_reset(sched);
1737
- lm_ggml_backend_sched_synchronize(sched);
1738
-
1739
- return true;
1740
- }
1741
-
1742
- bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1743
- LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1744
-
1745
- lm_ggml_backend_sched_split_graph(sched, graph);
1746
-
1747
-
1748
- if (!lm_ggml_backend_sched_alloc_splits(sched)) {
1749
- return false;
1750
- }
1751
-
1752
- sched->is_alloc = true;
1753
-
1754
- return true;
1755
- }
1756
-
1757
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1758
- enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
1759
- lm_ggml_backend_sched_synchronize(sched);
1760
- return err;
1761
- }
1762
-
1763
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1764
- if (!sched->is_reset && !sched->is_alloc) {
1765
- lm_ggml_backend_sched_reset(sched);
1766
- }
1767
-
1768
- if (!sched->is_alloc) {
1769
- if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
1770
- return LM_GGML_STATUS_ALLOC_FAILED;
1771
- }
1772
- }
1773
-
1774
- return lm_ggml_backend_sched_compute_splits(sched);
1775
- }
1776
-
1777
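For orientation, the typical driver loop for the scheduler API deleted in this hunk (and re-exported unchanged from the new split sources) looks roughly like the sketch below. It is a minimal sketch, not code from this package: build_graph() and its owning lm_ggml_context are assumed to be provided by the caller, and the headers follow the new split layout (ggml-cpu.h for the CPU backend entry points is an assumption).

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-cpu.h"   // lm_ggml_backend_cpu_init() in the split layout (assumption)

    static void run_once(struct lm_ggml_cgraph * graph /* from a hypothetical build_graph() */) {
        lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init();
        lm_ggml_backend_t backends[] = { cpu };          // the last backend must be the CPU backend

        // NULL bufts -> default buffer type of each backend; parallel=false -> single copy
        lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(
            backends, NULL, 1, graph->n_nodes + graph->n_leafs, false);

        // reserve worst-case buffers once, then compute (allocation happens lazily on compute)
        lm_ggml_backend_sched_reserve(sched, graph);
        if (lm_ggml_backend_sched_graph_compute(sched, graph) != LM_GGML_STATUS_SUCCESS) {
            // handle allocation/compute failure
        }

        lm_ggml_backend_sched_free(sched);
        lm_ggml_backend_free(cpu);
    }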
- void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
1778
- for (int i = 0; i < sched->n_backends; i++) {
1779
- lm_ggml_backend_synchronize(sched->backends[i]);
1780
- }
1781
- }
1782
-
1783
- void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
1784
- sched->callback_eval = callback;
1785
- sched->callback_eval_user_data = user_data;
1786
- }
1787
-
1788
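A sketch of an eval callback as it is consumed by lm_ggml_backend_sched_compute_splits above: the scheduler calls it twice per node, first with ask == true to decide whether the node's result is needed (unneeded consecutive nodes are batched into one compute call), then with ask == false after the node has been computed and the backend synchronized; returning false there stops the remaining nodes of the current split. The signature below is inferred from those call sites and should be checked against the lm_ggml_backend_sched_eval_callback typedef in ggml-backend.h.

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <stdio.h>

    // inferred signature: bool (*)(struct lm_ggml_tensor * t, bool ask, void * user_data)
    static bool debug_eval_cb(struct lm_ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            // "ask" phase: request results only for matrix multiplications
            return t->op == LM_GGML_OP_MUL_MAT;
        }
        // "observe" phase: the node was computed and the backend synchronized
        fprintf(stderr, "computed %s (%s)\n", t->name, lm_ggml_op_name(t->op));
        return true; // false would stop the remaining nodes of this split
    }

    // hooked up with:
    // lm_ggml_backend_sched_set_eval_callback(sched, debug_eval_cb, /*user_data=*/NULL);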
- int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
1789
- return sched->n_splits;
1790
- }
1791
-
1792
- int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
1793
- return sched->n_copies;
1794
- }
1795
-
1796
- int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
1797
- return sched->n_backends;
1798
- }
1799
-
1800
- lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
1801
- LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
1802
- return sched->backends[i];
1803
- }
1804
-
1805
- size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1806
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1807
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1808
-
1809
- return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1810
- }
1811
-
1812
- void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
1813
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1814
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1815
- tensor_backend_id(node) = backend_index;
1816
- SET_CAUSE(node, "usr");
1817
- sched->is_reset = false;
1818
- }
1819
-
1820
- lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
1821
- int backend_index = tensor_backend_id(node);
1822
- if (backend_index == -1) {
1823
- return NULL;
1824
- }
1825
- return sched->backends[backend_index];
1826
- }
1827
-
1828
- // utils
1829
-
1830
- void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
1831
- LM_GGML_ASSERT(tensor->buffer == NULL);
1832
- LM_GGML_ASSERT(tensor->view_src != NULL);
1833
- LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
1834
- LM_GGML_ASSERT(tensor->view_src->data != NULL);
1835
-
1836
- tensor->buffer = tensor->view_src->buffer;
1837
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1838
- lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1839
- }
1840
-
1841
- void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
1842
- LM_GGML_ASSERT(tensor->buffer == NULL);
1843
- LM_GGML_ASSERT(tensor->data == NULL);
1844
- LM_GGML_ASSERT(tensor->view_src == NULL);
1845
- LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
1846
- LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
1847
- (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
1848
-
1849
- tensor->buffer = buffer;
1850
- tensor->data = addr;
1851
- lm_ggml_backend_buffer_init_tensor(buffer, tensor);
1852
- }
1853
-
1854
- static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
1855
- struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
1856
-
1857
- LM_GGML_ASSERT(src != NULL);
1858
- LM_GGML_ASSERT(src->data && "graph must be allocated");
1859
-
1860
- size_t id = lm_ggml_hash_insert(&hash_set, src);
1861
- if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
1862
- return node_copies[lm_ggml_hash_find(&hash_set, src)];
1863
- }
1864
-
1865
- struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1866
- if (src->view_src != NULL) {
1867
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1868
- dst->view_offs = src->view_offs;
1869
- }
1870
- dst->op = src->op;
1871
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
1872
- lm_ggml_set_name(dst, src->name);
1873
-
1874
- // copy src
1875
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1876
- struct lm_ggml_tensor * s = src->src[i];
1877
- if (s == NULL) {
1878
- continue;
1879
- }
1880
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1881
- }
1882
-
1883
- node_copies[id] = dst;
1884
- return dst;
1885
- }
1886
-
1887
- static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
1888
- size_t id = lm_ggml_hash_find(hash_set, src);
1889
- if (node_init[id]) {
1890
- return;
1891
- }
1892
- node_init[id] = true;
1893
-
1894
- struct lm_ggml_tensor * dst = node_copies[id];
1895
- if (dst->view_src != NULL) {
1896
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1897
- lm_ggml_backend_view_init(dst);
1898
- }
1899
- else {
1900
- lm_ggml_backend_tensor_copy(src, dst);
1901
- }
1902
-
1903
- // init src
1904
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1905
- struct lm_ggml_tensor * s = src->src[i];
1906
- if (s == NULL) {
1907
- continue;
1908
- }
1909
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1910
- }
1911
- }
1912
-
1913
- struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
1914
- struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
1915
- struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
1916
- bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
1917
-
1918
- struct lm_ggml_init_params params = {
1919
- /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
1920
- /* .mem_buffer = */ NULL,
1921
- /* .no_alloc = */ true
1922
- };
1923
-
1924
- struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
1925
- struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
1926
-
1927
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1928
- LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
1929
- lm_ggml_hash_set_free(&hash_set);
1930
- free(node_copies);
1931
- free(node_init);
1932
- lm_ggml_free(ctx_allocated);
1933
- lm_ggml_free(ctx_unallocated);
1934
- return {
1935
- /* .buffer = */ NULL,
1936
- /* .ctx_allocated = */ NULL,
1937
- /* .ctx_unallocated = */ NULL,
1938
- /* .graph = */ NULL,
1939
- };
1940
- }
1941
-
1942
- // dup nodes
1943
- for (int i = 0; i < graph->n_nodes; i++) {
1944
- struct lm_ggml_tensor * node = graph->nodes[i];
1945
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1946
- }
1947
-
1948
- // allocate nodes
1949
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1950
- if (buffer == NULL) {
1951
- LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
1952
- lm_ggml_hash_set_free(&hash_set);
1953
- free(node_copies);
1954
- free(node_init);
1955
- lm_ggml_free(ctx_allocated);
1956
- lm_ggml_free(ctx_unallocated);
1957
- return {
1958
- /* .buffer = */ NULL,
1959
- /* .ctx_allocated = */ NULL,
1960
- /* .ctx_unallocated = */ NULL,
1961
- /* .graph = */ NULL,
1962
- };
1963
- }
1964
-
1965
- //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1966
-
1967
- // copy data and init views
1968
- for (int i = 0; i < graph->n_nodes; i++) {
1969
- struct lm_ggml_tensor * node = graph->nodes[i];
1970
- graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
1971
- }
1972
-
1973
- // build graph copy
1974
- struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
1975
- for (int i = 0; i < graph->n_nodes; i++) {
1976
- struct lm_ggml_tensor * node = graph->nodes[i];
1977
- struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
1978
- graph_copy->nodes[i] = node_copy;
1979
- }
1980
- graph_copy->n_nodes = graph->n_nodes;
1981
-
1982
- lm_ggml_hash_set_free(&hash_set);
1983
- free(node_copies);
1984
- free(node_init);
1985
-
1986
- return {
1987
- /* .buffer = */ buffer,
1988
- /* .ctx_allocated = */ ctx_allocated,
1989
- /* .ctx_unallocated = */ ctx_unallocated,
1990
- /* .graph = */ graph_copy,
1991
- };
1992
- }
1993
-
1994
- void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
1995
- lm_ggml_backend_buffer_free(copy.buffer);
1996
- lm_ggml_free(copy.ctx_allocated);
1997
- lm_ggml_free(copy.ctx_unallocated);
1998
- }
1999
-
2000
- bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2001
- struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2002
- if (copy.buffer == NULL) {
2003
- return false;
2004
- }
2005
-
2006
- struct lm_ggml_cgraph * g1 = graph;
2007
- struct lm_ggml_cgraph * g2 = copy.graph;
2008
-
2009
- assert(g1->n_nodes == g2->n_nodes);
2010
-
2011
- for (int i = 0; i < g1->n_nodes; i++) {
2012
- //printf("eval %d/%d\n", i, g1->n_nodes);
2013
- struct lm_ggml_tensor * t1 = g1->nodes[i];
2014
- struct lm_ggml_tensor * t2 = g2->nodes[i];
2015
-
2016
- assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2017
-
2018
- struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2019
- struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2020
-
2021
- lm_ggml_backend_graph_compute(backend1, &g1v);
2022
- lm_ggml_backend_graph_compute(backend2, &g2v);
2023
-
2024
- if (lm_ggml_is_view_op(t1->op)) {
2025
- continue;
2026
- }
2027
-
2028
- // compare results, calculate rms etc
2029
- if (!callback(i, t1, t2, user_data)) {
2030
- break;
2031
- }
2032
- }
2033
-
2034
- lm_ggml_backend_graph_copy_free(copy);
2035
-
2036
- return true;
2037
- }
2038
-
2039
-
2040
-
2041
- #include "ggml-backend.h"
2042
- #include "ggml-backend-impl.h"
2043
- #include "ggml-cpu.h"
2044
- #include "ggml-impl.h"
2045
- #include <cctype>
2046
- #include <string>
2047
-
2048
- // ggml-backend interface
2049
-
2050
- // CPU backend - buffer
2051
-
2052
- static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
2053
- uintptr_t data = (uintptr_t)buffer->context;
2054
-
2055
- // align the buffer
2056
- if (data % TENSOR_ALIGNMENT != 0) {
2057
- data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
2058
- }
2059
-
2060
- return (void *)data;
2061
- }
2062
-
2063
- static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2064
- lm_ggml_aligned_free(buffer->context, buffer->size);
2065
- }
2066
-
2067
- static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
2068
- memset((char *)tensor->data + offset, value, size);
2069
-
2070
- LM_GGML_UNUSED(buffer);
2071
- }
2072
-
2073
- static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2074
- memcpy((char *)tensor->data + offset, data, size);
2075
-
2076
- LM_GGML_UNUSED(buffer);
2077
- }
2078
-
2079
- static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2080
- memcpy(data, (const char *)tensor->data + offset, size);
2081
-
2082
- LM_GGML_UNUSED(buffer);
2083
- }
2084
-
2085
- static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
2086
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
2087
- memcpy(dst->data, src->data, lm_ggml_nbytes(src));
2088
- return true;
2089
- }
2090
- return false;
2091
-
2092
- LM_GGML_UNUSED(buffer);
2093
- }
2094
-
2095
- static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
2096
- memset(buffer->context, value, buffer->size);
2097
- }
2098
-
2099
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
2100
- /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
2101
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2102
- /* .init_tensor = */ NULL, // no initialization required
2103
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2104
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2105
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2106
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2107
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2108
- /* .reset = */ NULL,
2109
- };
2110
-
2111
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
2112
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
2113
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2114
- /* .init_tensor = */ NULL, // no initialization required
2115
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2116
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2117
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2118
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2119
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2120
- /* .reset = */ NULL,
2121
- };
2122
-
2123
- // CPU backend - buffer type
2124
-
2125
- static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2126
- return "CPU";
2127
-
2128
- LM_GGML_UNUSED(buft);
2129
- }
2130
-
2131
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2132
- void * data = lm_ggml_aligned_malloc(size);
2133
-
2134
- if (data == NULL) {
2135
- LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
2136
- return NULL;
2137
- }
2138
-
2139
- return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
2140
- }
2141
-
2142
- static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
2143
- return TENSOR_ALIGNMENT;
2144
-
2145
- LM_GGML_UNUSED(buft);
2146
- }
2147
-
2148
- static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
2149
- return true;
2150
-
2151
- LM_GGML_UNUSED(buft);
2152
- }
2153
-
2154
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
2155
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2156
- /* .iface = */ {
2157
- /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
2158
- /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2159
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2160
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2161
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2162
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2163
- },
2164
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2165
- /* .context = */ NULL,
2166
- };
2167
-
2168
- return &lm_ggml_backend_cpu_buffer_type;
2169
- }
2170
-
2171
- static const char * lm_ggml_backend_cpu_buffer_from_ptr_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2172
- return "CPU_Mapped";
2173
-
2174
- LM_GGML_UNUSED(buft);
2175
- }
2176
-
2177
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(void) {
2178
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2179
- /* .iface = */ {
2180
- /* .get_name = */ lm_ggml_backend_cpu_buffer_from_ptr_type_get_name,
2181
- /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2182
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2183
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2184
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2185
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2186
- },
2187
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2188
- /* .context = */ NULL,
2189
- };
2190
-
2191
- return &lm_ggml_backend_cpu_buffer_type;
2192
- }
2193
-
2194
- #ifdef LM_GGML_USE_CPU_HBM
2195
-
2196
- // buffer type HBM
2197
-
2198
- #include <hbwmalloc.h>
2199
-
2200
- static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2201
- return "CPU_HBM";
2202
-
2203
- LM_GGML_UNUSED(buft);
2204
- }
2205
-
2206
- static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2207
- hbw_free(buffer->context);
2208
- }
2209
-
2210
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2211
- void * ptr;
2212
- int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
2213
- if (result != 0) {
2214
- LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
2215
- return NULL;
2216
- }
2217
-
2218
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2219
- buffer->buft = buft;
2220
- buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
2221
-
2222
- return buffer;
2223
- }
2224
-
2225
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
2226
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
2227
- /* .iface = */ {
2228
- /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
2229
- /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
2230
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2231
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2232
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2233
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2234
- },
2235
- /* .context = */ NULL,
2236
- };
2237
-
2238
- return &lm_ggml_backend_cpu_buffer_type_hbm;
2239
- }
2240
- #endif
2241
-
2242
- static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
2243
- static lm_ggml_backend_buffer_type_t bufts[] = {
2244
- #ifdef LM_GGML_USE_CPU_HBM
2245
- lm_ggml_backend_cpu_hbm_buffer_type(),
2246
- #endif
2247
- NULL
2248
- };
2249
-
2250
- return bufts;
2251
-
2252
- LM_GGML_UNUSED(device);
2253
- }
2254
-
2255
- // CPU backend - backend (stream)
2256
-
2257
- struct lm_ggml_backend_cpu_context {
2258
- int n_threads;
2259
- lm_ggml_threadpool_t threadpool;
2260
-
2261
- uint8_t * work_data;
2262
- size_t work_size;
2263
-
2264
- lm_ggml_abort_callback abort_callback;
2265
- void * abort_callback_data;
2266
- };
2267
-
2268
- static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
2269
- return "CPU";
2270
-
2271
- LM_GGML_UNUSED(backend);
2272
- }
2273
-
2274
- static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
2275
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2276
- delete[] cpu_ctx->work_data;
2277
- delete cpu_ctx;
2278
- delete backend;
2279
- }
2280
-
2281
- struct lm_ggml_backend_plan_cpu {
2282
- struct lm_ggml_cplan cplan;
2283
- struct lm_ggml_cgraph cgraph;
2284
- };
2285
-
2286
- static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
2287
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2288
-
2289
- struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
2290
-
2291
- cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2292
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
2293
-
2294
- if (cpu_plan->cplan.work_size > 0) {
2295
- cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
2296
- if (cpu_plan->cplan.work_data == NULL) {
2297
- delete cpu_plan;
2298
- return NULL;
2299
- }
2300
- }
2301
-
2302
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
2303
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2304
-
2305
- return cpu_plan;
2306
- }
2307
-
2308
- static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2309
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2310
-
2311
- delete[] cpu_plan->cplan.work_data;
2312
- delete cpu_plan;
2313
-
2314
- LM_GGML_UNUSED(backend);
2315
- }
2316
-
2317
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2318
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2319
-
2320
- return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
2321
-
2322
- LM_GGML_UNUSED(backend);
2323
- }
2324
-
2325
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
2326
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2327
-
2328
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2329
-
2330
- if (cpu_ctx->work_size < cplan.work_size) {
2331
- delete[] cpu_ctx->work_data;
2332
- cpu_ctx->work_data = new uint8_t[cplan.work_size];
2333
- if (cpu_ctx->work_data == NULL) {
2334
- cpu_ctx->work_size = 0;
2335
- return LM_GGML_STATUS_ALLOC_FAILED;
2336
- }
2337
- cpu_ctx->work_size = cplan.work_size;
2338
- }
2339
- cplan.work_data = (uint8_t *)cpu_ctx->work_data;
2340
-
2341
- cplan.abort_callback = cpu_ctx->abort_callback;
2342
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2343
-
2344
- return lm_ggml_graph_compute(cgraph, &cplan);
2345
- }
2346
-
2347
- static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
2348
- /* .get_name = */ lm_ggml_backend_cpu_get_name,
2349
- /* .free = */ lm_ggml_backend_cpu_free,
2350
- /* .set_tensor_async = */ NULL,
2351
- /* .get_tensor_async = */ NULL,
2352
- /* .cpy_tensor_async = */ NULL,
2353
- /* .synchronize = */ NULL,
2354
- /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
2355
- /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
2356
- /* .graph_plan_update = */ NULL,
2357
- /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
2358
- /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
2359
- /* .event_record = */ NULL,
2360
- /* .event_wait = */ NULL,
2361
- };
2362
-
2363
- static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
2364
- static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
2365
- return &guid;
2366
- }
2367
-
2368
- lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
2369
- // initialize CPU backend now to avoid slowing the first graph computation
2370
- lm_ggml_cpu_init();
2371
-
2372
- struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
2373
- if (ctx == NULL) {
2374
- return NULL;
2375
- }
2376
-
2377
- ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
2378
- ctx->threadpool = NULL;
2379
- ctx->work_data = NULL;
2380
- ctx->work_size = 0;
2381
- ctx->abort_callback = NULL;
2382
- ctx->abort_callback_data = NULL;
2383
-
2384
- lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
2385
- /* .guid = */ lm_ggml_backend_cpu_guid(),
2386
- /* .interface = */ lm_ggml_backend_cpu_i,
2387
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2388
- /* .context = */ ctx,
2389
- };
2390
-
2391
- if (cpu_backend == NULL) {
2392
- delete ctx;
2393
- return NULL;
2394
- }
2395
-
2396
- return cpu_backend;
2397
- }
2398
-
2399
- bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
2400
- return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
2401
- }
2402
-
2403
- void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
2404
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2405
-
2406
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2407
- ctx->n_threads = n_threads;
2408
- }
2409
-
2410
- void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
2411
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2412
-
2413
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2414
-
2415
- if (ctx->threadpool && ctx->threadpool != threadpool) {
2416
- // already had a different threadpool, pause/suspend it before switching
2417
- lm_ggml_threadpool_pause(ctx->threadpool);
2418
- }
2419
- ctx->threadpool = threadpool;
2420
- }
2421
-
2422
- void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
2423
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2424
-
2425
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2426
- ctx->abort_callback = abort_callback;
2427
- ctx->abort_callback_data = abort_callback_data;
2428
- }
2429
-
2430
- lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
2431
- LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
2432
- return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
2433
- }
2434
-
2435
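Before the device/registry plumbing that follows, the user-facing knobs of the CPU backend deleted in this hunk come down to a handful of setters. A small sketch, with the abort-callback semantics (return true to abort) taken from the lm_ggml_abort_callback documentation in ggml.h and the stop_flag wiring purely illustrative:

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-cpu.h"   // CPU backend entry points in the split layout (assumption)

    // assumed to match lm_ggml_abort_callback: return true to abort the running graph
    static bool should_abort(void * data) {
        volatile bool * stop_flag = (volatile bool *) data;
        return *stop_flag;
    }

    static lm_ggml_backend_t make_cpu_backend(int n_threads, volatile bool * stop_flag) {
        lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init();
        if (cpu == NULL) {
            return NULL;
        }
        // both setters assert lm_ggml_backend_is_cpu() internally, so only call them on the CPU backend
        lm_ggml_backend_cpu_set_n_threads(cpu, n_threads);
        lm_ggml_backend_cpu_set_abort_callback(cpu, should_abort, (void *) stop_flag);
        return cpu;
    }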
- // CPU backend - device
2436
-
2437
- struct lm_ggml_backend_cpu_device_context {
2438
- std::string description = "CPU";
2439
-
2440
- lm_ggml_backend_cpu_device_context() {
2441
- #ifdef __APPLE__
2442
- size_t len = 0;
2443
- if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
2444
- description.resize(len);
2445
- sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
2446
- }
2447
- #elif defined(__linux__)
2448
- FILE * f = fopen("/proc/cpuinfo", "r");
2449
- if (f) {
2450
- char buf[1024];
2451
- while (fgets(buf, sizeof(buf), f)) {
2452
- if (strncmp(buf, "model name", 10) == 0) {
2453
- char * p = strchr(buf, ':');
2454
- if (p) {
2455
- p++;
2456
- while (std::isspace(*p)) {
2457
- p++;
2458
- }
2459
- while (std::isspace(p[strlen(p) - 1])) {
2460
- p[strlen(p) - 1] = '\0';
2461
- }
2462
- description = p;
2463
- break;
2464
- }
2465
- }
2466
- }
2467
- fclose(f);
2468
- }
2469
- #elif defined(_WIN32)
2470
- HKEY hKey;
2471
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
2472
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
2473
- 0,
2474
- KEY_READ,
2475
- &hKey) == ERROR_SUCCESS) {
2476
- DWORD cpu_brand_size = 0;
2477
- if (RegQueryValueExA(hKey,
2478
- TEXT("ProcessorNameString"),
2479
- NULL,
2480
- NULL,
2481
- NULL,
2482
- &cpu_brand_size) == ERROR_SUCCESS) {
2483
- description.resize(cpu_brand_size);
2484
- if (RegQueryValueExA(hKey,
2485
- TEXT("ProcessorNameString"),
2486
- NULL,
2487
- NULL,
2488
- (LPBYTE)&description[0], // NOLINT
2489
- &cpu_brand_size) == ERROR_SUCCESS) {
2490
- if (description.find('\0') != std::string::npos) {
2491
- description.resize(description.find('\0'));
2492
- }
2493
- }
2494
- }
2495
- RegCloseKey(hKey);
2496
- }
2497
- #endif
2498
- }
2499
- };
2500
-
2501
- static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
2502
- return "CPU";
2503
-
2504
- LM_GGML_UNUSED(dev);
2505
- }
2506
-
2507
- static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
2508
- struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
2509
-
2510
- return ctx->description.c_str();
2511
- }
2512
-
2513
- static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
2514
- // TODO
2515
- *free = 0;
2516
- *total = 0;
2517
-
2518
- LM_GGML_UNUSED(dev);
2519
- }
2520
-
2521
- static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
2522
- return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
2523
-
2524
- LM_GGML_UNUSED(dev);
2525
- }
2526
-
2527
- static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
2528
- props->name = lm_ggml_backend_cpu_device_get_name(dev);
2529
- props->description = lm_ggml_backend_cpu_device_get_description(dev);
2530
- props->type = lm_ggml_backend_cpu_device_get_type(dev);
2531
- lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
2532
- props->caps = {
2533
- /* .async = */ false,
2534
- /* .host_buffer = */ false,
2535
- /* .buffer_from_host_ptr = */ true,
2536
- /* .events = */ false,
2537
- };
2538
- }
2539
-
2540
- static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
2541
- return lm_ggml_backend_cpu_init();
2542
-
2543
- LM_GGML_UNUSED(dev);
2544
- LM_GGML_UNUSED(params);
2545
- }
2546
-
2547
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
2548
- return lm_ggml_backend_cpu_buffer_type();
2549
-
2550
- LM_GGML_UNUSED(dev);
2551
- }
2552
-
2553
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
2554
- return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2555
-
2556
- LM_GGML_UNUSED(dev);
2557
- LM_GGML_UNUSED(max_tensor_size);
2558
- }
2559
-
2560
- static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
2561
- switch (op->op) {
2562
- case LM_GGML_OP_CPY:
2563
- return
2564
- op->type != LM_GGML_TYPE_IQ2_XXS &&
2565
- op->type != LM_GGML_TYPE_IQ2_XS &&
2566
- op->type != LM_GGML_TYPE_IQ1_S &&
2567
- op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
2568
- case LM_GGML_OP_MUL_MAT:
2569
- return op->src[1]->type == LM_GGML_TYPE_F32;// FIXME || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
2570
- case LM_GGML_OP_ROPE_BACK:
2571
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
2572
- case LM_GGML_OP_IM2COL_BACK:
2573
- return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
2574
- case LM_GGML_OP_OUT_PROD:
2575
- return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
2576
- default:
2577
- return true;
2578
- }
2579
-
2580
- LM_GGML_UNUSED(dev);
2581
- }
2582
-
2583
- static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
2584
- return lm_ggml_backend_buft_is_host(buft);
2585
-
2586
- LM_GGML_UNUSED(dev);
2587
- }
2588
-
2589
- static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
2590
- /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
2591
- /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
2592
- /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
2593
- /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
2594
- /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
2595
- /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
2596
- /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
2597
- /* .get_host_buffer_type = */ NULL,
2598
- /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
2599
- /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
2600
- /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
2601
- /* .offload_op = */ NULL,
2602
- /* .event_new = */ NULL,
2603
- /* .event_free = */ NULL,
2604
- /* .event_synchronize = */ NULL,
2605
- };
2606
-
2607
- // CPU backend - backend (reg)
2608
-
2609
- static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
2610
- return "CPU";
2611
-
2612
- LM_GGML_UNUSED(reg);
2613
- }
2614
-
2615
- static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
2616
- return 1;
2617
-
2618
- LM_GGML_UNUSED(reg);
2619
- }
2620
-
2621
- static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
2622
- LM_GGML_ASSERT(index == 0);
2623
-
2624
- static lm_ggml_backend_cpu_device_context ctx;
2625
- static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
2626
- /* .iface = */ lm_ggml_backend_cpu_device_i,
2627
- /* .reg = */ reg,
2628
- /* .context = */ &ctx,
2629
- };
2630
-
2631
- return &lm_ggml_backend_cpu_device;
2632
- }
2633
-
2634
- static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
2635
- if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
2636
- return (void *)lm_ggml_backend_cpu_set_n_threads;
2637
- }
2638
- if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
2639
- return (void *)lm_ggml_backend_cpu_get_extra_bufts;
2640
- }
2641
-
2642
- return NULL;
2643
-
2644
- LM_GGML_UNUSED(reg);
2645
- }
2646
-
2647
- static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
2648
- /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
2649
- /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
2650
- /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
2651
- /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
2652
- };
2653
-
2654
- lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
2655
- static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
2656
- /* .iface = */ lm_ggml_backend_cpu_reg_i,
2657
- /* .context = */ NULL,
2658
- };
2659
-
2660
- return &lm_ggml_backend_cpu_reg;
2661
- }
1
+ // Note: porting this file to C++ is a work in progress
2
+
3
+ #ifdef _WIN32
4
+ #define WIN32_LEAN_AND_MEAN
5
+ #ifndef NOMINMAX
6
+ # define NOMINMAX
7
+ #endif
8
+ #include <windows.h>
9
+ #endif
10
+
11
+ #include "ggml-backend.h"
12
+ #include "ggml-backend-impl.h"
13
+ #include "ggml-alloc.h"
14
+ #include "ggml-impl.h"
15
+
16
+ #include <assert.h>
17
+ #include <limits.h>
18
+ #include <stdarg.h>
19
+ #include <stdio.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ #ifdef __APPLE__
26
+ #include <sys/types.h>
27
+ #include <sys/sysctl.h>
28
+ #endif
29
+
30
+
31
+ // backend buffer type
32
+
33
+ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
34
+ return buft->iface.get_name(buft);
35
+ }
36
+
37
+ lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
38
+ if (size == 0) {
39
+ // return a dummy buffer for zero-sized allocations
40
+ return lm_ggml_backend_buffer_init(buft, {}, NULL, 0);
41
+ }
42
+
43
+ return buft->iface.alloc_buffer(buft, size);
44
+ }
45
+
46
+ size_t lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type_t buft) {
47
+ return buft->iface.get_alignment(buft);
48
+ }
49
+
50
+ size_t lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_type_t buft) {
51
+ // get_max_size is optional, defaults to SIZE_MAX
52
+ if (buft->iface.get_max_size) {
53
+ return buft->iface.get_max_size(buft);
54
+ }
55
+ return SIZE_MAX;
56
+ }
57
+
58
+ size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
59
+ // get_alloc_size is optional, defaults to lm_ggml_nbytes
60
+ if (buft->iface.get_alloc_size) {
61
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
62
+ assert(size >= lm_ggml_nbytes(tensor));
63
+ return size;
64
+ }
65
+ return lm_ggml_nbytes(tensor);
66
+ }
67
+
68
+ bool lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_type_t buft) {
69
+ if (buft->iface.is_host) {
70
+ return buft->iface.is_host(buft);
71
+ }
72
+ return false;
73
+ }
74
+
75
+ lm_ggml_backend_dev_t lm_ggml_backend_buft_get_device(lm_ggml_backend_buffer_type_t buft) {
76
+ return buft->device;
77
+ }
78
+
79
+ // backend buffer
80
+
81
+ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
82
+ lm_ggml_backend_buffer_type_t buft,
83
+ struct lm_ggml_backend_buffer_i iface,
84
+ void * context,
85
+ size_t size) {
86
+ lm_ggml_backend_buffer_t buffer = new lm_ggml_backend_buffer {
87
+ /* .interface = */ iface,
88
+ /* .buft = */ buft,
89
+ /* .context = */ context,
90
+ /* .size = */ size,
91
+ /* .usage = */ LM_GGML_BACKEND_BUFFER_USAGE_ANY
92
+ };
93
+
94
+ return buffer;
95
+ }
96
+
97
+ const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
98
+ return lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(buffer));
99
+ }
100
+
101
+ void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
102
+ if (buffer == NULL) {
103
+ return;
104
+ }
105
+
106
+ if (buffer->iface.free_buffer != NULL) {
107
+ buffer->iface.free_buffer(buffer);
108
+ }
109
+ delete buffer;
110
+ }
111
+
112
+ size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
113
+ return buffer->size;
114
+ }
115
+
116
+ void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
117
+ // get_base is optional if the buffer is zero-sized
118
+ if (buffer->size == 0) {
119
+ return NULL;
120
+ }
121
+
122
+ void * base = buffer->iface.get_base(buffer);
123
+
124
+ LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
125
+
126
+ return base;
127
+ }
128
+
129
+ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
130
+ // init_tensor is optional
131
+ if (buffer->iface.init_tensor) {
132
+ buffer->iface.init_tensor(buffer, tensor);
133
+ }
134
+ }
135
+
136
+ void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
137
+ // clear is optional if the buffer is zero-sized
138
+ if (buffer->size == 0) {
139
+ return;
140
+ }
141
+
142
+ buffer->iface.clear(buffer, value);
143
+ }
144
+
145
+ size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
146
+ return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
147
+ }
148
+
149
+ size_t lm_ggml_backend_buffer_get_max_size(lm_ggml_backend_buffer_t buffer) {
150
+ return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_get_type(buffer));
151
+ }
152
+
153
+ size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
154
+ return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
155
+ }
156
+
157
+ bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
158
+ return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
159
+ }
160
+
161
+ void lm_ggml_backend_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
162
+ buffer->usage = usage;
163
+
164
+ // FIXME: add a generic callback to the buffer interface
165
+ if (lm_ggml_backend_buffer_is_multi_buffer(buffer)) {
166
+ lm_ggml_backend_multi_buffer_set_usage(buffer, usage);
167
+ }
168
+ }
169
+
170
+ enum lm_ggml_backend_buffer_usage lm_ggml_backend_buffer_get_usage(lm_ggml_backend_buffer_t buffer) {
171
+ return buffer->usage;
172
+ }
173
+
174
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_get_type(lm_ggml_backend_buffer_t buffer) {
175
+ return buffer->buft;
176
+ }
177
+
178
+ void lm_ggml_backend_buffer_reset(lm_ggml_backend_buffer_t buffer) {
179
+ if (buffer->iface.reset) {
180
+ buffer->iface.reset(buffer);
181
+ }
182
+ }
183
+
184
+ bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
185
+ lm_ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
186
+ if (dst_buf->iface.cpy_tensor) {
187
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
188
+ }
189
+ return false;
190
+ }
191
+
192
+ // backend
193
+
194
+ lm_ggml_guid_t lm_ggml_backend_guid(lm_ggml_backend_t backend) {
195
+ if (backend == NULL) {
196
+ return NULL;
197
+ }
198
+ return backend->guid;
199
+ }
200
+
201
+ const char * lm_ggml_backend_name(lm_ggml_backend_t backend) {
202
+ if (backend == NULL) {
203
+ return "NULL";
204
+ }
205
+ return backend->iface.get_name(backend);
206
+ }
207
+
208
+ void lm_ggml_backend_free(lm_ggml_backend_t backend) {
209
+ if (backend == NULL) {
210
+ return;
211
+ }
212
+
213
+ backend->iface.free(backend);
214
+ }
215
+
216
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
217
+ return lm_ggml_backend_dev_buffer_type(backend->device);
218
+ }
219
+
220
+ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
221
+ return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_get_default_buffer_type(backend), size);
222
+ }
223
+
224
+ size_t lm_ggml_backend_get_alignment(lm_ggml_backend_t backend) {
225
+ return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_get_default_buffer_type(backend));
226
+ }
227
+
228
+ size_t lm_ggml_backend_get_max_size(lm_ggml_backend_t backend) {
229
+ return lm_ggml_backend_buft_get_max_size(lm_ggml_backend_get_default_buffer_type(backend));
230
+ }
231
+
232
+ void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
233
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
234
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
235
+
236
+ if (backend->iface.set_tensor_async == NULL) {
237
+ lm_ggml_backend_tensor_set(tensor, data, offset, size);
238
+ } else {
239
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
240
+ }
241
+ }
242
+
243
+ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
244
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
245
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
246
+
247
+ if (backend->iface.get_tensor_async == NULL) {
248
+ lm_ggml_backend_tensor_get(tensor, data, offset, size);
249
+ } else {
250
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
251
+ }
252
+ }
253
+
254
+ void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
255
+ lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
256
+
257
+ if (size == 0) {
258
+ return;
259
+ }
260
+
261
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
262
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
263
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
264
+
265
+ buf->iface.set_tensor(buf, tensor, data, offset, size);
266
+ }
267
+
268
+ void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
269
+ lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
270
+
271
+ if (size == 0) {
272
+ return;
273
+ }
274
+
275
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
276
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
277
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
278
+
279
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
280
+ }
281
+
282
+ LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
283
+ lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
284
+
285
+ if (size == 0) {
286
+ return;
287
+ }
288
+
289
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
290
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
291
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
292
+ LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
293
+
294
+ buf->iface.memset_tensor(buf, tensor, value, offset, size);
295
+ }
296
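
A minimal usage sketch of the accessors above, assuming an F32 tensor `t` that has already been allocated in a backend buffer (upload_and_readback is a hypothetical helper; only functions visible in this file plus lm_ggml_nbytes are used):

// Sketch: round-trip data through a backend buffer (blocking on both sides).
static void upload_and_readback(struct lm_ggml_tensor * t) {
    const size_t nbytes = lm_ggml_nbytes(t);
    float * host_in  = (float *) malloc(nbytes);
    float * host_out = (float *) malloc(nbytes);
    for (size_t i = 0; i < nbytes/sizeof(float); i++) {
        host_in[i] = 1.0f;
    }
    lm_ggml_backend_tensor_set(t, host_in,  0, nbytes); // host -> backend buffer
    lm_ggml_backend_tensor_get(t, host_out, 0, nbytes); // backend buffer -> host
    free(host_in);
    free(host_out);
}
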
+
297
+ void lm_ggml_backend_synchronize(lm_ggml_backend_t backend) {
298
+ if (backend->iface.synchronize == NULL) {
299
+ return;
300
+ }
301
+
302
+ backend->iface.synchronize(backend);
303
+ }
304
+
305
+ lm_ggml_backend_graph_plan_t lm_ggml_backend_graph_plan_create(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
306
+ LM_GGML_ASSERT(backend->iface.graph_plan_create != NULL);
307
+
308
+ return backend->iface.graph_plan_create(backend, cgraph);
309
+ }
310
+
311
+ void lm_ggml_backend_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
312
+ LM_GGML_ASSERT(backend->iface.graph_plan_free != NULL);
313
+
314
+ backend->iface.graph_plan_free(backend, plan);
315
+ }
316
+
317
+ enum lm_ggml_status lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
318
+ LM_GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
319
+
320
+ return backend->iface.graph_plan_compute(backend, plan);
321
+ }
322
+
323
+ enum lm_ggml_status lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
324
+ enum lm_ggml_status err = lm_ggml_backend_graph_compute_async(backend, cgraph);
325
+ lm_ggml_backend_synchronize(backend);
326
+ return err;
327
+ }
328
+
329
+ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
330
+ return backend->iface.graph_compute(backend, cgraph);
331
+ }
332
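
Since lm_ggml_backend_graph_compute is just the async variant followed by a synchronize, a caller that wants to overlap host work with backend execution can split the two steps itself. A hedged sketch; the cgraph `gf` is assumed to be built and allocated elsewhere:

// Sketch: overlap host-side work with backend execution of an already-allocated graph.
static enum lm_ggml_status run_graph_overlapped(lm_ggml_backend_t backend, struct lm_ggml_cgraph * gf) {
    enum lm_ggml_status st = lm_ggml_backend_graph_compute_async(backend, gf);
    if (st != LM_GGML_STATUS_SUCCESS) {
        return st;
    }
    // ... unrelated host-side work can run here ...
    lm_ggml_backend_synchronize(backend); // results are only valid after this returns
    return st;
}
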
+
333
+ bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
334
+ return lm_ggml_backend_dev_supports_op(backend->device, op);
335
+ }
336
+
337
+ bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
338
+ return lm_ggml_backend_dev_supports_buft(backend->device, buft);
339
+ }
340
+
341
+ bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
342
+ return lm_ggml_backend_dev_offload_op(backend->device, op);
343
+ }
344
+
345
+ lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
346
+ return backend->device;
347
+ }
348
+
349
+ // backend copy
350
+
351
+ static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) {
352
+ if (a->type != b->type) {
353
+ return false;
354
+ }
355
+ for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
356
+ if (a->ne[i] != b->ne[i]) {
357
+ return false;
358
+ }
359
+ if (a->nb[i] != b->nb[i]) {
360
+ return false;
361
+ }
362
+ }
363
+ return true;
364
+ }
365
+
366
+ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
367
+ LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
368
+
369
+ if (src == dst) {
370
+ return;
371
+ }
372
+
373
+ if (lm_ggml_backend_buffer_is_host(src->buffer)) {
374
+ lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src));
375
+ } else if (lm_ggml_backend_buffer_is_host(dst->buffer)) {
376
+ lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
377
+ } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
378
+ #ifndef NDEBUG
379
+ LM_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
380
+ #endif
381
+ size_t nbytes = lm_ggml_nbytes(src);
382
+ void * data = malloc(nbytes);
383
+ lm_ggml_backend_tensor_get(src, data, 0, nbytes);
384
+ lm_ggml_backend_tensor_set(dst, data, 0, nbytes);
385
+ free(data);
386
+ }
387
+ }
388
+
389
+ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
390
+ LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
391
+
392
+ if (src == dst) {
393
+ return;
394
+ }
395
+
396
+ if (backend_dst->iface.cpy_tensor_async != NULL) {
397
+ if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
398
+ return;
399
+ }
400
+ }
401
+
402
+ // an async copy would normally happen after all the queued operations on both backends are completed
403
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
404
+ lm_ggml_backend_synchronize(backend_src);
405
+ lm_ggml_backend_synchronize(backend_dst);
406
+ lm_ggml_backend_tensor_copy(src, dst);
407
+ }
408
+
409
+ // events
410
+
411
+ lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_dev_t device) {
412
+ // null device is allowed for the transition period to the device interface
413
+ if (device == NULL || device->iface.event_new == NULL) {
414
+ return NULL;
415
+ }
416
+ return device->iface.event_new(device);
417
+ }
418
+
419
+ void lm_ggml_backend_event_free(lm_ggml_backend_event_t event) {
420
+ if (event == NULL) {
421
+ return;
422
+ }
423
+ event->device->iface.event_free(event->device, event);
424
+ }
425
+
426
+ void lm_ggml_backend_event_record(lm_ggml_backend_event_t event, lm_ggml_backend_t backend) {
427
+ LM_GGML_ASSERT(backend->iface.event_record != NULL);
428
+
429
+ backend->iface.event_record(backend, event);
430
+ }
431
+
432
+ void lm_ggml_backend_event_synchronize(lm_ggml_backend_event_t event) {
433
+ LM_GGML_ASSERT(event->device->iface.event_synchronize);
434
+
435
+ event->device->iface.event_synchronize(event->device, event);
436
+ }
437
+
438
+ void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event_t event) {
439
+ LM_GGML_ASSERT(backend->iface.event_wait != NULL);
440
+
441
+ backend->iface.event_wait(backend, event);
442
+ }
443
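
Events let the host mark a point in a backend's queue and wait on that point without a full device synchronize (lm_ggml_backend_event_wait is the device-side counterpart used by the scheduler further down). A minimal sketch, assuming the device implements the event interface; when it does not, event_new returns NULL and a plain synchronize is the fallback:

// Sketch: record the current end of the backend's queue and let the host wait on it later.
static void record_and_wait(lm_ggml_backend_t backend) {
    lm_ggml_backend_event_t ev = lm_ggml_backend_event_new(lm_ggml_backend_get_device(backend));
    if (ev == NULL) {
        lm_ggml_backend_synchronize(backend); // events not supported: blocking fallback
        return;
    }
    // ... submit async work on `backend` here (set_tensor_async, graph_compute_async, ...) ...
    lm_ggml_backend_event_record(ev, backend);  // mark the current end of the queue
    // ... unrelated host work ...
    lm_ggml_backend_event_synchronize(ev);      // block the host until the marked point is reached
    lm_ggml_backend_event_free(ev);
}
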
+
444
+ // Backend device
445
+
446
+ const char * lm_ggml_backend_dev_name(lm_ggml_backend_dev_t device) {
447
+ return device->iface.get_name(device);
448
+ }
449
+
450
+ const char * lm_ggml_backend_dev_description(lm_ggml_backend_dev_t device) {
451
+ return device->iface.get_description(device);
452
+ }
453
+
454
+ void lm_ggml_backend_dev_memory(lm_ggml_backend_dev_t device, size_t * free, size_t * total) {
455
+ device->iface.get_memory(device, free, total);
456
+ }
457
+
458
+ enum lm_ggml_backend_dev_type lm_ggml_backend_dev_type(lm_ggml_backend_dev_t device) {
459
+ return device->iface.get_type(device);
460
+ }
461
+
462
+ void lm_ggml_backend_dev_get_props(lm_ggml_backend_dev_t device, struct lm_ggml_backend_dev_props * props) {
463
+ memset(props, 0, sizeof(*props));
464
+ device->iface.get_props(device, props);
465
+ }
466
+
467
+ lm_ggml_backend_reg_t lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_t device) {
468
+ return device->reg;
469
+ }
470
+
471
+ lm_ggml_backend_t lm_ggml_backend_dev_init(lm_ggml_backend_dev_t device, const char * params) {
472
+ return device->iface.init_backend(device, params);
473
+ }
474
+
475
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_buffer_type(lm_ggml_backend_dev_t device) {
476
+ return device->iface.get_buffer_type(device);
477
+ }
478
+
479
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_host_buffer_type(lm_ggml_backend_dev_t device) {
480
+ if (device->iface.get_host_buffer_type == NULL) {
481
+ return NULL;
482
+ }
483
+
484
+ return device->iface.get_host_buffer_type(device);
485
+ }
486
+
487
+ lm_ggml_backend_buffer_t lm_ggml_backend_dev_buffer_from_host_ptr(lm_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
488
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
489
+ }
490
+
491
+ bool lm_ggml_backend_dev_supports_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
492
+ return device->iface.supports_op(device, op);
493
+ }
494
+
495
+ bool lm_ggml_backend_dev_supports_buft(lm_ggml_backend_dev_t device, lm_ggml_backend_buffer_type_t buft) {
496
+ return device->iface.supports_buft(device, buft);
497
+ }
498
+
499
+ bool lm_ggml_backend_dev_offload_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
500
+ if (device->iface.offload_op != NULL) {
501
+ return device->iface.offload_op(device, op);
502
+ }
503
+
504
+ return false;
505
+ }
506
+
507
+ // Backend (reg)
508
+
509
+ const char * lm_ggml_backend_reg_name(lm_ggml_backend_reg_t reg) {
510
+ return reg->iface.get_name(reg);
511
+ }
512
+
513
+ size_t lm_ggml_backend_reg_dev_count(lm_ggml_backend_reg_t reg) {
514
+ return reg->iface.get_device_count(reg);
515
+ }
516
+
517
+ lm_ggml_backend_dev_t lm_ggml_backend_reg_dev_get(lm_ggml_backend_reg_t reg, size_t index) {
518
+ return reg->iface.get_device(reg, index);
519
+ }
520
+
521
+ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
522
+ if (!reg->iface.get_proc_address) {
523
+ return NULL;
524
+ }
525
+ return reg->iface.get_proc_address(reg, name);
526
+ }
527
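
A small sketch of how the registry and device getters above fit together; passing NULL init params is assumed to select the device's defaults:

// Sketch: initialize a backend on the first device exposed by a registry.
static lm_ggml_backend_t init_first_reg_device(lm_ggml_backend_reg_t reg) {
    if (lm_ggml_backend_reg_dev_count(reg) == 0) {
        return NULL;
    }
    lm_ggml_backend_dev_t dev = lm_ggml_backend_reg_dev_get(reg, 0);
    // NULL params: assumed to mean "use the device defaults"
    return lm_ggml_backend_dev_init(dev, /*params =*/ NULL);
}
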
+
528
+ // multi-buffer buffer
529
+
530
+ struct lm_ggml_backend_multi_buffer_context {
531
+ lm_ggml_backend_buffer_t * buffers;
532
+ size_t n_buffers;
533
+ };
534
+
535
+ static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
536
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
537
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
538
+ lm_ggml_backend_buffer_free(ctx->buffers[i]);
539
+ }
540
+
541
+ free(ctx->buffers);
542
+ free(ctx);
543
+ }
544
+
545
+ static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
546
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
547
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
548
+ lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
549
+ }
550
+ }
551
+
552
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
553
+ /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
554
+ /* .get_base = */ NULL,
555
+ /* .init_tensor = */ NULL,
556
+ /* .memset_tensor = */ NULL,
557
+ /* .set_tensor = */ NULL,
558
+ /* .get_tensor = */ NULL,
559
+ /* .cpy_tensor = */ NULL,
560
+ /* .clear = */ lm_ggml_backend_multi_buffer_clear,
561
+ /* .reset = */ NULL,
562
+ };
563
+
564
+ lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
565
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
566
+ ctx->n_buffers = n_buffers;
567
+ ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
568
+
569
+ LM_GGML_ASSERT(ctx->buffers != NULL);
570
+
571
+ size_t total_size = 0;
572
+ for (size_t i = 0; i < n_buffers; i++) {
573
+ ctx->buffers[i] = buffers[i];
574
+ total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
575
+ }
576
+
577
+ return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
578
+ }
579
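
A small sketch of how the multi-buffer helper above might be used to treat several backend buffers as one unit, assuming `backend` was initialized elsewhere:

// Sketch: combine two buffers so they can be cleared and freed as a single unit.
static lm_ggml_backend_buffer_t make_multi(lm_ggml_backend_t backend) {
    lm_ggml_backend_buffer_t bufs[2] = {
        lm_ggml_backend_alloc_buffer(backend, 1u << 20),   // 1 MiB
        lm_ggml_backend_alloc_buffer(backend, 2u << 20),   // 2 MiB
    };
    lm_ggml_backend_buffer_t multi = lm_ggml_backend_multi_buffer_alloc_buffer(bufs, 2);
    lm_ggml_backend_buffer_clear(multi, 0);                // clears both underlying buffers
    return multi;                                          // freeing it frees both underlying buffers
}
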
+
580
+ bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
581
+ return buffer->iface.free_buffer == lm_ggml_backend_multi_buffer_free_buffer;
582
+ }
583
+
584
+ void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
585
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
586
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
587
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
588
+ lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
589
+ }
590
+ }
591
+
592
+ // creates a copy of the tensor with the same memory layout
593
+ static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
594
+ struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
595
+ for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
596
+ dup->nb[i] = tensor->nb[i];
597
+ }
598
+ return dup;
599
+ }
600
+
601
+ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
602
+ return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
603
+ }
604
+
605
+ // scheduler
606
+
607
+ #ifndef LM_GGML_SCHED_MAX_BACKENDS
608
+ #define LM_GGML_SCHED_MAX_BACKENDS 16
609
+ #endif
610
+
611
+ #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
612
+ #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
613
+ #endif
614
+
615
+ #ifndef LM_GGML_SCHED_MAX_COPIES
616
+ #define LM_GGML_SCHED_MAX_COPIES 4
617
+ #endif
618
+
619
+ struct lm_ggml_backend_sched_split {
620
+ int backend_id;
621
+ int i_start;
622
+ int i_end;
623
+ struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
624
+ int n_inputs;
625
+ // graph view of this split
626
+ struct lm_ggml_cgraph graph;
627
+ };
628
+
629
+ struct lm_ggml_backend_sched {
630
+ bool is_reset; // true if the scheduler has been reset since the last graph split
631
+ bool is_alloc;
632
+
633
+ int n_backends;
634
+
635
+ lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
636
+ lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
637
+ lm_ggml_gallocr_t galloc;
638
+
639
+ // hash map of the nodes in the graph
640
+ struct lm_ggml_hash_set hash_set;
641
+ int * hv_tensor_backend_ids; // [hash_set.size]
642
+ struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
643
+
644
+ int * node_backend_ids; // [graph_size]
645
+ int * leaf_backend_ids; // [graph_size]
646
+
647
+ int * prev_node_backend_ids; // [graph_size]
648
+ int * prev_leaf_backend_ids; // [graph_size]
649
+
650
+ // copy of the graph with modified inputs
651
+ struct lm_ggml_cgraph graph;
652
+
653
+ // graph splits
654
+ struct lm_ggml_backend_sched_split * splits;
655
+ int n_splits;
656
+ int splits_capacity;
657
+
658
+ // pipeline parallelism support
659
+ int n_copies;
660
+ int cur_copy;
661
+ lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
662
+ struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
663
+ int n_graph_inputs;
664
+
665
+ struct lm_ggml_context * ctx;
666
+
667
+ lm_ggml_backend_sched_eval_callback callback_eval;
668
+ void * callback_eval_user_data;
669
+
670
+ char * context_buffer;
671
+ size_t context_buffer_size;
672
+
673
+ int debug;
674
+ };
675
+
676
+ #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
677
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
678
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
679
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
680
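
hv_tensor_copies is a single flat array indexed as [hash_id][backend_id][copy_id]; the tensor_id_copy macro linearizes that layout. As a worked example, with n_backends = 2 and n_copies = 4, the slot for hash id 7 on backend 1, copy 2 is 7*(2*4) + 1*4 + 2 = 62.
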
+
681
+ // returns the priority of the backend, lower id is higher priority
682
+ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
683
+ for (int i = 0; i < sched->n_backends; i++) {
684
+ if (sched->backends[i] == backend) {
685
+ return i;
686
+ }
687
+ }
688
+ return -1;
689
+ }
690
+
691
+ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
692
+ lm_ggml_backend_buffer_t buffer = tensor->buffer;
693
+ if (buffer == NULL) {
694
+ return -1;
695
+ }
696
+
697
+ // find highest prio backend that supports the buffer type and the op
698
+ for (int i = 0; i < sched->n_backends; i++) {
699
+ if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
700
+ lm_ggml_backend_supports_op(sched->backends[i], op)) {
701
+ return i;
702
+ }
703
+ }
704
+
705
+ #ifndef NDEBUG
706
+ LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
707
+ __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
708
+ #endif
709
+
710
+ return -1;
711
+ }
712
+
713
+ #if 0
714
+ #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
715
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
716
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
717
+ #define GET_CAUSE(node) causes[hash_id(node)]
718
+ #else
719
+ #define SET_CAUSE(node, ...)
720
+ #define GET_CAUSE(node) ""
721
+ #endif
722
+
723
+ // returns the backend that should be used for the node based on the current locations
724
+ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
725
+ // TODO: use supports_op to check if the backend supports the op
726
+
727
+ // assign pre-allocated nodes to their backend
728
+ int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
729
+ if (cur_backend_id != -1) {
730
+ SET_CAUSE(tensor, "1.dst");
731
+ return cur_backend_id;
732
+ }
733
+
734
+ // view_src
735
+ if (tensor->view_src != NULL) {
736
+ cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
737
+ if (cur_backend_id != -1) {
738
+ SET_CAUSE(tensor, "1.vsrc");
739
+ return cur_backend_id;
740
+ }
741
+ }
742
+
743
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
744
+ // since the tensor is pre-allocated, it cannot be moved to another backend
745
+ LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
746
+ }
747
+
748
+ // graph input
749
+ if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
750
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
751
+ SET_CAUSE(tensor, "1.inp");
752
+ return cur_backend_id;
753
+ }
754
+
755
+ // operations with weights are preferably run on the same backend as the weights
756
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
757
+ const struct lm_ggml_tensor * src = tensor->src[i];
758
+ if (src == NULL) {
759
+ continue;
760
+ }
761
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
762
+ // not an ideal solution
763
+ if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
764
+ int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
765
+ // check if a backend with higher prio wants to offload the op
766
+ if (src_backend_id == sched->n_backends - 1) {
767
+ for (int b = 0; b < src_backend_id; b++) {
768
+ if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
769
+ SET_CAUSE(tensor, "1.off");
770
+ return b;
771
+ }
772
+ }
773
+ }
774
+ SET_CAUSE(tensor, "1.wgt%d", i);
775
+ return src_backend_id;
776
+ }
777
+ }
778
+
779
+ return -1;
780
+ }
781
+
782
+ static char * fmt_size(size_t size) {
783
+ static char buffer[128];
784
+ if (size >= 1024*1024) {
785
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
786
+ } else {
787
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
788
+ }
789
+ return buffer;
790
+ }
791
+
792
+ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
793
+ int cur_split = 0;
794
+ for (int i = 0; i < graph->n_nodes; i++) {
795
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
796
+ lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
797
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
798
+ sched->splits[cur_split].n_inputs);
799
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
800
+ LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
801
+ fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
802
+ }
803
+ LM_GGML_LOG_DEBUG("\n");
804
+ cur_split++;
805
+ }
806
+ struct lm_ggml_tensor * node = graph->nodes[i];
807
+ if (lm_ggml_is_view_op(node->op)) {
808
+ continue;
809
+ }
810
+ if (sched->debug > 1) {
811
+ lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
812
+ LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
813
+ fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
814
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
815
+ struct lm_ggml_tensor * src = node->src[j];
816
+ if (src == NULL) {
817
+ continue;
818
+ }
819
+ lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
820
+ LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
821
+ fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
822
+ }
823
+ LM_GGML_LOG_DEBUG("\n");
824
+ }
825
+ }
826
+ }
827
+
828
+ static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
829
+ lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
830
+ lm_ggml_backend_buffer_type_t buft = NULL;
831
+
832
+ if (buf) {
833
+ // the tensor is already allocated
834
+ buft = buf->buft;
835
+ } else {
836
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
837
+ int tensor_backend_id = tensor_backend_id(t);
838
+ if (tensor_backend_id == -1 && t->view_src) {
839
+ tensor_backend_id = tensor_backend_id(t->view_src);
840
+ }
841
+ if (tensor_backend_id != -1) {
842
+ buft = sched->bufts[tensor_backend_id];
843
+ }
844
+ }
845
+
846
+ return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
847
+ }
848
+
849
+ static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
850
+ if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
851
+ *node_backend_id = cur_backend_id;
852
+ SET_CAUSE(node, "2.sup");
853
+ }
854
+ }
855
+
856
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
857
+ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
858
+ // reset splits
859
+ sched->n_splits = 0;
860
+ sched->n_graph_inputs = 0;
861
+ sched->is_reset = false;
862
+
863
+ struct lm_ggml_init_params params = {
864
+ /* .mem_size = */ sched->context_buffer_size,
865
+ /* .mem_buffer = */ sched->context_buffer,
866
+ /* .no_alloc = */ true
867
+ };
868
+
869
+ lm_ggml_free(sched->ctx);
870
+
871
+ sched->ctx = lm_ggml_init(params);
872
+ if (sched->ctx == NULL) {
873
+ LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
874
+ }
875
+
876
+ // pass 1: assign backends to ops with pre-allocated inputs
877
+ for (int i = 0; i < graph->n_leafs; i++) {
878
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
879
+ int * leaf_backend_id = &tensor_backend_id(leaf);
880
+ // do not overwrite user assignments
881
+ if (*leaf_backend_id == -1) {
882
+ *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
883
+ }
884
+ }
885
+
886
+ for (int i = 0; i < graph->n_nodes; i++) {
887
+ struct lm_ggml_tensor * node = graph->nodes[i];
888
+ int * node_backend_id = &tensor_backend_id(node);
889
+ // do not overwrite user assignments
890
+ if (*node_backend_id == -1) {
891
+ *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
892
+
893
+ #if 0
894
+ // src
895
+ if (node->op == LM_GGML_OP_NONE) {
896
+ continue;
897
+ }
898
+
899
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
900
+ struct lm_ggml_tensor * src = node->src[j];
901
+ if (src == NULL) {
902
+ continue;
903
+ }
904
+ int * src_backend_id = &tensor_backend_id(src);
905
+ if (*src_backend_id == -1) {
906
+ *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
907
+ }
908
+ }
909
+ #endif
910
+ }
911
+ }
912
+
913
+ // pass 2: expand current backend assignments
914
+ // assign the same backend to adjacent nodes
915
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
916
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
917
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
918
+ // expand gpu down
919
+ {
920
+ int cur_backend_id = -1;
921
+ for (int i = 0; i < graph->n_nodes; i++) {
922
+ struct lm_ggml_tensor * node = graph->nodes[i];
923
+ if (lm_ggml_is_view_op(node->op)) {
924
+ continue;
925
+ }
926
+ int * node_backend_id = &tensor_backend_id(node);
927
+ if (*node_backend_id != -1) {
928
+ if (*node_backend_id == sched->n_backends - 1) {
929
+ // skip cpu (lowest prio backend)
930
+ cur_backend_id = -1;
931
+ } else {
932
+ cur_backend_id = *node_backend_id;
933
+ }
934
+ } else if (cur_backend_id != -1) {
935
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
936
+ }
937
+ }
938
+ }
939
+ // expand gpu up
940
+ {
941
+ int cur_backend_id = -1;
942
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
943
+ struct lm_ggml_tensor * node = graph->nodes[i];
944
+ if (lm_ggml_is_view_op(node->op)) {
945
+ continue;
946
+ }
947
+ int * node_backend_id = &tensor_backend_id(node);
948
+ if (*node_backend_id != -1) {
949
+ if (*node_backend_id == sched->n_backends - 1) {
950
+ // skip cpu (lowest prio backend)
951
+ cur_backend_id = -1;
952
+ } else {
953
+ cur_backend_id = *node_backend_id;
954
+ }
955
+ } else if (cur_backend_id != -1) {
956
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
957
+ }
958
+ }
959
+ }
960
+ // expand rest down
961
+ {
962
+ int cur_backend_id = -1;
963
+ for (int i = 0; i < graph->n_nodes; i++) {
964
+ struct lm_ggml_tensor * node = graph->nodes[i];
965
+ if (lm_ggml_is_view_op(node->op)) {
966
+ continue;
967
+ }
968
+ int * node_backend_id = &tensor_backend_id(node);
969
+ if (*node_backend_id != -1) {
970
+ cur_backend_id = *node_backend_id;
971
+ } else if (cur_backend_id != -1) {
972
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
973
+ }
974
+ }
975
+ }
976
+ // expand rest up
977
+ {
978
+ int cur_backend_id = -1;
979
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
980
+ struct lm_ggml_tensor * node = graph->nodes[i];
981
+ if (lm_ggml_is_view_op(node->op)) {
982
+ continue;
983
+ }
984
+ int * node_backend_id = &tensor_backend_id(node);
985
+ if (*node_backend_id != -1) {
986
+ cur_backend_id = *node_backend_id;
987
+ } else if (cur_backend_id != -1) {
988
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
989
+ }
990
+ }
991
+ }
992
+
993
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
994
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
995
+ // however, we also need to verify that the sources are in compatible buffer types
996
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
997
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
998
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
999
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1000
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1001
+ for (int i = 0; i < graph->n_nodes; i++) {
1002
+ struct lm_ggml_tensor * node = graph->nodes[i];
1003
+ if (lm_ggml_is_view_op(node->op)) {
1004
+ continue;
1005
+ }
1006
+ int * node_backend_id = &tensor_backend_id(node);
1007
+ if (*node_backend_id == -1) {
1008
+ // unassigned node: find the backend with the most supported inputs
1009
+ int n_supported_best = -1;
1010
+ for (int b = 0; b < sched->n_backends; b++) {
1011
+ if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1012
+ int n_supported = 0;
1013
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1014
+ struct lm_ggml_tensor * src = node->src[j];
1015
+ if (src == NULL) {
1016
+ continue;
1017
+ }
1018
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1019
+ n_supported++;
1020
+ }
1021
+ }
1022
+ if (n_supported > n_supported_best) {
1023
+ n_supported_best = n_supported;
1024
+ *node_backend_id = b;
1025
+ SET_CAUSE(node, "3.best");
1026
+ }
1027
+ }
1028
+ }
1029
+ } else {
1030
+ // assigned node: upgrade to higher prio backend if possible
1031
+ for (int b = 0; b < *node_backend_id; b++) {
1032
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1033
+ bool supported = true;
1034
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1035
+ struct lm_ggml_tensor * src = node->src[j];
1036
+ if (src == NULL) {
1037
+ continue;
1038
+ }
1039
+ if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1040
+ supported = false;
1041
+ break;
1042
+ }
1043
+ }
1044
+ if (supported) {
1045
+ *node_backend_id = b;
1046
+ SET_CAUSE(node, "3.upg");
1047
+ break;
1048
+ }
1049
+ }
1050
+ }
1051
+ }
1052
+ }
1053
+
1054
+ // pass 4: assign backends to remaining src from dst and view_src
1055
+ for (int i = 0; i < graph->n_nodes; i++) {
1056
+ struct lm_ggml_tensor * node = graph->nodes[i];
1057
+ int * cur_backend_id = &tensor_backend_id(node);
1058
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1059
+ *cur_backend_id = tensor_backend_id(node->view_src);
1060
+ SET_CAUSE(node, "4.vsrc");
1061
+ }
1062
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1063
+ struct lm_ggml_tensor * src = node->src[j];
1064
+ if (src == NULL) {
1065
+ continue;
1066
+ }
1067
+ int * src_backend_id = &tensor_backend_id(src);
1068
+ if (*src_backend_id == -1) {
1069
+ if (src->view_src != NULL) {
1070
+ // views are always on the same backend as the source
1071
+ *src_backend_id = tensor_backend_id(src->view_src);
1072
+ SET_CAUSE(src, "4.vsrc");
1073
+ } else {
1074
+ *src_backend_id = *cur_backend_id;
1075
+ SET_CAUSE(src, "4.cur");
1076
+ }
1077
+ }
1078
+ }
1079
+ }
1080
+
1081
+ // pass 5: split graph, find tensors that need to be copied
1082
+ {
1083
+ int i_split = 0;
1084
+ struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1085
+ // find the backend of the first split, skipping view ops
1086
+ int i = 0;
1087
+ for (; i < graph->n_nodes; i++) {
1088
+ struct lm_ggml_tensor * node = graph->nodes[i];
1089
+ if (!lm_ggml_is_view_op(node->op)) {
1090
+ split->backend_id = tensor_backend_id(node);
1091
+ break;
1092
+ }
1093
+ }
1094
+ split->i_start = 0;
1095
+ split->n_inputs = 0;
1096
+ int cur_backend_id = split->backend_id;
1097
+ for (; i < graph->n_nodes; i++) {
1098
+ struct lm_ggml_tensor * node = graph->nodes[i];
1099
+
1100
+ if (lm_ggml_is_view_op(node->op)) {
1101
+ continue;
1102
+ }
1103
+
1104
+ const int node_backend_id = tensor_backend_id(node);
1105
+
1106
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1107
+
1108
+ // check if we should start a new split based on the sources of the current node
1109
+ bool need_new_split = false;
1110
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1111
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1112
+ struct lm_ggml_tensor * src = node->src[j];
1113
+ if (src == NULL) {
1114
+ continue;
1115
+ }
1116
+ // check if a weight is on a different and incompatible backend
1117
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1118
+ if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1119
+ int src_backend_id = tensor_backend_id(src);
1120
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1121
+ need_new_split = true;
1122
+ break;
1123
+ }
1124
+ }
1125
+ // check if the split has too many inputs
1126
+ // FIXME: count the number of inputs instead of only checking when full
1127
+ if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1128
+ const size_t id = hash_id(src);
1129
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1130
+ bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1131
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1132
+ need_new_split = true;
1133
+ break;
1134
+ }
1135
+ }
1136
+ }
1137
+ }
1138
+
1139
+ if (node_backend_id != cur_backend_id || need_new_split) {
1140
+ split->i_end = i;
1141
+ i_split++;
1142
+ if (i_split >= sched->splits_capacity) {
1143
+ sched->splits_capacity *= 2;
1144
+ sched->splits = (lm_ggml_backend_sched_split *)
1145
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1146
+ LM_GGML_ASSERT(sched->splits != NULL);
1147
+ }
1148
+ split = &sched->splits[i_split];
1149
+ split->backend_id = node_backend_id;
1150
+ split->i_start = i;
1151
+ split->n_inputs = 0;
1152
+ cur_backend_id = node_backend_id;
1153
+ }
1154
+
1155
+ // find inputs that are not on the same backend
1156
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1157
+ struct lm_ggml_tensor * src = node->src[j];
1158
+ if (src == NULL) {
1159
+ continue;
1160
+ }
1161
+
1162
+ size_t src_id = hash_id(src);
1163
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1164
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1165
+
1166
+ if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1167
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1168
+ lm_ggml_backend_t backend = sched->backends[src_backend_id];
1169
+ for (int c = 0; c < sched->n_copies; c++) {
1170
+ struct lm_ggml_tensor * tensor_copy;
1171
+ if (c == sched->cur_copy) {
1172
+ tensor_copy = src; // use the original tensor as the current copy
1173
+ } else {
1174
+ tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1175
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1176
+ }
1177
+ if (sched->n_copies > 1) {
1178
+ lm_ggml_set_input(tensor_copy);
1179
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1180
+ }
1181
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1182
+ SET_CAUSE(tensor_copy, "4.cpy");
1183
+ }
1184
+ int n_graph_inputs = sched->n_graph_inputs++;
1185
+ LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1186
+ sched->graph_inputs[n_graph_inputs] = src;
1187
+ }
1188
+ }
1189
+
1190
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1191
+ // create a copy of the input in the split's backend
1192
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1193
+ lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1194
+ for (int c = 0; c < sched->n_copies; c++) {
1195
+ struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1196
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1197
+ if (sched->n_copies > 1) {
1198
+ lm_ggml_set_input(tensor_copy);
1199
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1200
+ }
1201
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1202
+ SET_CAUSE(tensor_copy, "4.cpy");
1203
+ }
1204
+ int n_inputs = split->n_inputs++;
1205
+ LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1206
+ split->inputs[n_inputs] = src;
1207
+ }
1208
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1209
+ }
1210
+ }
1211
+ }
1212
+ split->i_end = graph->n_nodes;
1213
+ sched->n_splits = i_split + 1;
1214
+ }
1215
+
1216
+ if (sched->debug) {
1217
+ lm_ggml_backend_sched_print_assignments(sched, graph);
1218
+ }
1219
+
1220
+ // swap node_backend_ids and leaf_backend_ids with prevs
1221
+ {
1222
+ int * tmp = sched->node_backend_ids;
1223
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1224
+ sched->prev_node_backend_ids = tmp;
1225
+
1226
+ tmp = sched->leaf_backend_ids;
1227
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1228
+ sched->prev_leaf_backend_ids = tmp;
1229
+ }
1230
+
1231
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1232
+ if (sched->graph.size < graph_size) {
1233
+ sched->graph.size = graph_size;
1234
+ sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1235
+ sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1236
+ LM_GGML_ASSERT(sched->graph.nodes != NULL);
1237
+ LM_GGML_ASSERT(sched->graph.leafs != NULL);
1238
+ }
1239
+ sched->graph.n_nodes = 0;
1240
+ sched->graph.n_leafs = 0;
1241
+
1242
+ struct lm_ggml_cgraph * graph_copy = &sched->graph;
1243
+
1244
+ for (int i = 0; i < sched->n_splits; i++) {
1245
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1246
+ split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1247
+
1248
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1249
+ for (int j = 0; j < split->n_inputs; j++) {
1250
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1251
+
1252
+ struct lm_ggml_tensor * input = split->inputs[j];
1253
+ const size_t input_id = hash_id(input);
1254
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1255
+
1256
+ // add a dependency to the input source so that it is not freed before the copy is done
1257
+ struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
1258
+ input_dep->src[0] = input;
1259
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1260
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1261
+
1262
+ // add a dependency to the input copy so that it is allocated at the start of the split
1263
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1264
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1265
+ }
1266
+
1267
+ for (int j = split->i_start; j < split->i_end; j++) {
1268
+ assert(graph_copy->size > graph_copy->n_nodes);
1269
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1270
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1271
+ }
1272
+ }
1273
+
1274
+ if (sched->n_copies > 1) {
1275
+ // add input copies as leafs so that they are allocated first
1276
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1277
+ struct lm_ggml_tensor * input = sched->graph_inputs[i];
1278
+ size_t id = hash_id(input);
1279
+ int backend_id = tensor_backend_id(input);
1280
+ for (int c = 0; c < sched->n_copies; c++) {
1281
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1282
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1283
+ assert(graph_copy->size > graph_copy->n_leafs);
1284
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1285
+ }
1286
+ }
1287
+
1288
+ for (int i = 0; i < sched->n_splits; i++) {
1289
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1290
+ int backend_id = split->backend_id;
1291
+ for (int j = 0; j < split->n_inputs; j++) {
1292
+ struct lm_ggml_tensor * input = split->inputs[j];
1293
+ size_t id = hash_id(input);
1294
+ for (int c = 0; c < sched->n_copies; c++) {
1295
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1296
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1297
+ assert(graph_copy->size > graph_copy->n_leafs);
1298
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1299
+ }
1300
+ }
1301
+ }
1302
+ }
1303
+
1304
+ // add leafs from the original graph
1305
+ for (int i = 0; i < graph->n_leafs; i++) {
1306
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1307
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1308
+ assert(graph_copy->size > graph_copy->n_leafs);
1309
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1310
+ }
1311
+ }
1312
+
1313
+ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
1314
+ bool backend_ids_changed = false;
1315
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
1316
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1317
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1318
+ backend_ids_changed = true;
1319
+ break;
1320
+ }
1321
+ }
1322
+ if (!backend_ids_changed) {
1323
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
1324
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1325
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1326
+ backend_ids_changed = true;
1327
+ break;
1328
+ }
1329
+ }
1330
+ }
1331
+
1332
+ // allocate graph
1333
+ if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1334
+ // the re-allocation may cause the split inputs to be moved to a different address
1335
+ lm_ggml_backend_sched_synchronize(sched);
1336
+ #ifndef NDEBUG
1337
+ LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1338
+ #endif
1339
+ lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1340
+ if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1341
+ LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
1342
+ return false;
1343
+ }
1344
+ }
1345
+
1346
+ return true;
1347
+ }
1348
+
1349
+ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
1350
+ struct lm_ggml_backend_sched_split * splits = sched->splits;
1351
+
1352
+ for (int i = 0; i < sched->n_splits; i++) {
1353
+ struct lm_ggml_backend_sched_split * split = &splits[i];
1354
+ int split_backend_id = split->backend_id;
1355
+ lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
1356
+
1357
+ // copy the input tensors to the split backend
1358
+ for (int j = 0; j < split->n_inputs; j++) {
1359
+ lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1360
+ struct lm_ggml_tensor * input = split->inputs[j];
1361
+ struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1362
+
1363
+ if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1364
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1365
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1366
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1367
+ } else {
1368
+ lm_ggml_backend_synchronize(split_backend);
1369
+ }
1370
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1371
+ } else {
1372
+ // wait for the split backend to finish using the input before overwriting it
1373
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1374
+ lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1375
+ } else {
1376
+ lm_ggml_backend_synchronize(split_backend);
1377
+ }
1378
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1379
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1380
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1381
+ lm_ggml_backend_synchronize(input_backend);
1382
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1383
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1384
+ } else {
1385
+ lm_ggml_backend_synchronize(split_backend);
1386
+ }
1387
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1388
+ }
1389
+ }
1390
+ }
1391
+
1392
+ if (!sched->callback_eval) {
1393
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
1394
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1395
+ return ec;
1396
+ }
1397
+ } else {
1398
+ // similar to lm_ggml_backend_compare_graph_backend
1399
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1400
+ struct lm_ggml_tensor * t = split->graph.nodes[j0];
1401
+
1402
+ // check if the user needs data from this node
1403
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1404
+
1405
+ int j1 = j0;
1406
+
1407
+ // determine the range [j0, j1] of nodes that can be computed together
1408
+ while (!need && j1 < split->graph.n_nodes - 1) {
1409
+ t = split->graph.nodes[++j1];
1410
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1411
+ }
1412
+
1413
+ struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
1414
+
1415
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
1416
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1417
+ return ec;
1418
+ }
1419
+
1420
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1421
+ lm_ggml_backend_synchronize(split_backend);
1422
+
1423
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1424
+ break;
1425
+ }
1426
+
1427
+ j0 = j1;
1428
+ }
1429
+ }
1430
+
1431
+ // record the event of this copy
1432
+ if (split->n_inputs > 0) {
1433
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1434
+ lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
1435
+ }
1436
+ }
1437
+ }
1438
+
1439
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1440
+
1441
+ return LM_GGML_STATUS_SUCCESS;
1442
+ }
1443
+
1444
+ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1445
+ lm_ggml_backend_t * backends,
1446
+ lm_ggml_backend_buffer_type_t * bufts,
1447
+ int n_backends,
1448
+ size_t graph_size,
1449
+ bool parallel) {
1450
+ LM_GGML_ASSERT(n_backends > 0);
1451
+ LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
1452
+ LM_GGML_ASSERT(lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backends[n_backends - 1])) == LM_GGML_BACKEND_DEVICE_TYPE_CPU);
1453
+
1454
+ struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
1455
+
1456
+ const char * LM_GGML_SCHED_DEBUG = getenv("LM_GGML_SCHED_DEBUG");
1457
+ sched->debug = LM_GGML_SCHED_DEBUG ? atoi(LM_GGML_SCHED_DEBUG) : 0;
1458
+ sched->n_backends = n_backends;
1459
+ sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
1460
+
1461
+ // initialize hash table
1462
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1463
+ sched->hash_set = lm_ggml_hash_set_new(graph_size);
1464
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1465
+ sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1466
+
1467
+ const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
1468
+ const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1469
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1470
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1471
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1472
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1473
+
1474
+ sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1475
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
1476
+
1477
+ const int initial_splits_capacity = 16;
1478
+ sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1479
+ sched->splits_capacity = initial_splits_capacity;
1480
+
1481
+ for (int b = 0; b < n_backends; b++) {
1482
+ sched->backends[b] = backends[b];
1483
+ sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
1484
+ LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1485
+
1486
+ if (sched->n_copies > 1) {
1487
+ for (int c = 0; c < sched->n_copies; c++) {
1488
+ sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
1489
+ }
1490
+ }
1491
+ }
1492
+
1493
+ sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
1494
+
1495
+ lm_ggml_backend_sched_reset(sched);
1496
+
1497
+ return sched;
1498
+ }
1499
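
Putting the scheduler API together, a typical lifecycle is: create it once with the backend list (CPU last, as asserted above), reserve buffers against a worst-case graph, then reset and compute per evaluation. A hedged sketch; build_graph() is a hypothetical helper, and graph_size should cover the node plus leaf count of the largest graph:

// Sketch: scheduler lifecycle with one GPU backend plus the mandatory CPU backend.
static void run_with_sched(lm_ggml_backend_t gpu, lm_ggml_backend_t cpu, struct lm_ggml_cgraph * measure_graph) {
    lm_ggml_backend_t backends[2] = { gpu, cpu }; // CPU must be last (lowest priority)
    lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(
        backends, /*bufts =*/ NULL, /*n_backends =*/ 2,
        /*graph_size =*/ LM_GGML_DEFAULT_GRAPH_SIZE, /*parallel =*/ false);

    if (!lm_ggml_backend_sched_reserve(sched, measure_graph)) { // size buffers up front
        lm_ggml_backend_sched_free(sched);
        return;
    }

    lm_ggml_backend_sched_reset(sched);             // per evaluation: reset ...
    struct lm_ggml_cgraph * gf = build_graph();     // ... rebuild the graph (hypothetical helper) ...
    lm_ggml_backend_sched_graph_compute(sched, gf); // ... then split, allocate and run it

    lm_ggml_backend_sched_free(sched);
}
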
+
1500
+ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
1501
+ if (sched == NULL) {
1502
+ return;
1503
+ }
1504
+ for (int b = 0; b < sched->n_backends; b++) {
1505
+ for (int c = 0; c < sched->n_copies; c++) {
1506
+ lm_ggml_backend_event_free(sched->events[b][c]);
1507
+ }
1508
+ }
1509
+ lm_ggml_gallocr_free(sched->galloc);
1510
+ lm_ggml_free(sched->ctx);
1511
+ lm_ggml_hash_set_free(&sched->hash_set);
1512
+ free(sched->splits);
1513
+ free(sched->hv_tensor_backend_ids);
1514
+ free(sched->hv_tensor_copies);
1515
+ free(sched->node_backend_ids);
1516
+ free(sched->leaf_backend_ids);
1517
+ free(sched->prev_node_backend_ids);
1518
+ free(sched->prev_leaf_backend_ids);
1519
+ free(sched->context_buffer);
1520
+ free(sched->graph.nodes);
1521
+ free(sched->graph.leafs);
1522
+ free(sched);
1523
+ }
1524
+
1525
+ void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
1526
+ // reset state for the next run
1527
+ if (!sched->is_reset) {
1528
+ lm_ggml_hash_set_reset(&sched->hash_set);
1529
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1530
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1531
+ sched->is_reset = true;
1532
+ }
1533
+ sched->is_alloc = false;
1534
+ }
1535
+
1536
+ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
1537
+ LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1538
+
1539
+ lm_ggml_backend_sched_split_graph(sched, measure_graph);
1540
+
1541
+ if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1542
+ return false;
1543
+ }
1544
+
1545
+ lm_ggml_backend_sched_reset(sched);
1546
+ lm_ggml_backend_sched_synchronize(sched);
1547
+
1548
+ return true;
1549
+ }
1550
+
1551
+ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1552
+ LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1553
+
1554
+ lm_ggml_backend_sched_split_graph(sched, graph);
1555
+
1556
+
1557
+ if (!lm_ggml_backend_sched_alloc_splits(sched)) {
1558
+ return false;
1559
+ }
1560
+
1561
+ sched->is_alloc = true;
1562
+
1563
+ return true;
1564
+ }
1565
+
1566
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1567
+ enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
1568
+ lm_ggml_backend_sched_synchronize(sched);
1569
+ return err;
1570
+ }
1571
+
1572
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1573
+ if (!sched->is_reset && !sched->is_alloc) {
1574
+ lm_ggml_backend_sched_reset(sched);
1575
+ }
1576
+
1577
+ if (!sched->is_alloc) {
1578
+ if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
1579
+ return LM_GGML_STATUS_ALLOC_FAILED;
1580
+ }
1581
+ }
1582
+
1583
+ return lm_ggml_backend_sched_compute_splits(sched);
1584
+ }
1585
+
1586
+ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
1587
+ for (int i = 0; i < sched->n_backends; i++) {
1588
+ lm_ggml_backend_synchronize(sched->backends[i]);
1589
+ }
1590
+ }
1591
+
1592
+ void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
1593
+ sched->callback_eval = callback;
1594
+ sched->callback_eval_user_data = user_data;
1595
+ }
1596
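
The eval callback is queried twice per node range by lm_ggml_backend_sched_compute_splits above: first with ask == true to decide whether the host wants the data, then with ask == false once the range has been computed and the backend synchronized. A minimal sketch of a callback that observes every node (needs <stdio.h>; returning false from the second call aborts the split):

// Sketch: observe every node as it is computed.
static bool log_every_node(struct lm_ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return true; // yes, we want to see this node's data
    }
    fprintf(stderr, "computed %s\n", t->name); // t is valid here, backend already synchronized
    return true;     // keep computing the rest of the split
}
// registered with: lm_ggml_backend_sched_set_eval_callback(sched, log_every_node, NULL);
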
+
1597
+ int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
1598
+ return sched->n_splits;
1599
+ }
1600
+
1601
+ int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
1602
+ return sched->n_copies;
1603
+ }
1604
+
1605
+ int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
1606
+ return sched->n_backends;
1607
+ }
1608
+
1609
+ lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
1610
+ LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
1611
+ return sched->backends[i];
1612
+ }
1613
+
1614
+ size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1615
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1616
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1617
+
1618
+ return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1619
+ }
1620
+
1621
+ void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
1622
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1623
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1624
+ tensor_backend_id(node) = backend_index;
1625
+ SET_CAUSE(node, "usr");
1626
+ sched->is_reset = false;
1627
+ }
1628
+
1629
+ lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
1630
+ int backend_index = tensor_backend_id(node);
1631
+ if (backend_index == -1) {
1632
+ return NULL;
1633
+ }
1634
+ return sched->backends[backend_index];
1635
+ }
1636
+
1637
+ // utils
1638
+
1639
+ void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
1640
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1641
+ LM_GGML_ASSERT(tensor->view_src != NULL);
1642
+ LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
1643
+ LM_GGML_ASSERT(tensor->view_src->data != NULL);
1644
+
1645
+ tensor->buffer = tensor->view_src->buffer;
1646
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1647
+ lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1648
+ }
1649
+
1650
+ void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
1651
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1652
+ LM_GGML_ASSERT(tensor->data == NULL);
1653
+ LM_GGML_ASSERT(tensor->view_src == NULL);
1654
+ LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
1655
+ LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
1656
+ (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
1657
+
1658
+ tensor->buffer = buffer;
1659
+ tensor->data = addr;
1660
+ lm_ggml_backend_buffer_init_tensor(buffer, tensor);
1661
+ }
1662
+
1663
+ static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
1664
+ struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
1665
+
1666
+ LM_GGML_ASSERT(src != NULL);
1667
+ LM_GGML_ASSERT(src->data && "graph must be allocated");
1668
+
1669
+ size_t id = lm_ggml_hash_insert(&hash_set, src);
1670
+ if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
1671
+ return node_copies[lm_ggml_hash_find(&hash_set, src)];
1672
+ }
1673
+
1674
+ struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1675
+ if (src->view_src != NULL) {
1676
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1677
+ dst->view_offs = src->view_offs;
1678
+ }
1679
+ dst->op = src->op;
1680
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
1681
+ lm_ggml_set_name(dst, src->name);
1682
+
1683
+ // copy src
1684
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1685
+ struct lm_ggml_tensor * s = src->src[i];
1686
+ if (s == NULL) {
1687
+ continue;
1688
+ }
1689
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1690
+ }
1691
+
1692
+ node_copies[id] = dst;
1693
+ return dst;
1694
+ }
1695
+
1696
+ static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
1697
+ size_t id = lm_ggml_hash_find(hash_set, src);
1698
+ if (node_init[id]) {
1699
+ return;
1700
+ }
1701
+ node_init[id] = true;
1702
+
1703
+ struct lm_ggml_tensor * dst = node_copies[id];
1704
+ if (dst->view_src != NULL) {
1705
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1706
+ lm_ggml_backend_view_init(dst);
1707
+ }
1708
+ else {
1709
+ lm_ggml_backend_tensor_copy(src, dst);
1710
+ }
1711
+
1712
+ // init src
1713
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1714
+ struct lm_ggml_tensor * s = src->src[i];
1715
+ if (s == NULL) {
1716
+ continue;
1717
+ }
1718
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1719
+ }
1720
+ }
1721
+
1722
+ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
1723
+ struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
1724
+ struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
1725
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
1726
+
1727
+ struct lm_ggml_init_params params = {
1728
+ /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
1729
+ /* .mem_buffer = */ NULL,
1730
+ /* .no_alloc = */ true
1731
+ };
1732
+
1733
+ struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
1734
+ struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
1735
+
1736
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1737
+ LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
1738
+ lm_ggml_hash_set_free(&hash_set);
1739
+ free(node_copies);
1740
+ free(node_init);
1741
+ lm_ggml_free(ctx_allocated);
1742
+ lm_ggml_free(ctx_unallocated);
1743
+ return {
1744
+ /* .buffer = */ NULL,
1745
+ /* .ctx_allocated = */ NULL,
1746
+ /* .ctx_unallocated = */ NULL,
1747
+ /* .graph = */ NULL,
1748
+ };
1749
+ }
1750
+
1751
+ // dup nodes
1752
+ for (int i = 0; i < graph->n_nodes; i++) {
1753
+ struct lm_ggml_tensor * node = graph->nodes[i];
1754
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1755
+ }
1756
+
1757
+ // allocate nodes
1758
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1759
+ if (buffer == NULL) {
1760
+ LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
1761
+ lm_ggml_hash_set_free(&hash_set);
1762
+ free(node_copies);
1763
+ free(node_init);
1764
+ lm_ggml_free(ctx_allocated);
1765
+ lm_ggml_free(ctx_unallocated);
1766
+ return {
1767
+ /* .buffer = */ NULL,
1768
+ /* .ctx_allocated = */ NULL,
1769
+ /* .ctx_unallocated = */ NULL,
1770
+ /* .graph = */ NULL,
1771
+ };
1772
+ }
1773
+
1774
+ //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1775
+
1776
+ // copy data and init views
1777
+ for (int i = 0; i < graph->n_nodes; i++) {
1778
+ struct lm_ggml_tensor * node = graph->nodes[i];
1779
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
1780
+ }
1781
+
1782
+ // build graph copy
1783
+ struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
1784
+ for (int i = 0; i < graph->n_nodes; i++) {
1785
+ struct lm_ggml_tensor * node = graph->nodes[i];
1786
+ struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
1787
+ graph_copy->nodes[i] = node_copy;
1788
+ }
1789
+ graph_copy->n_nodes = graph->n_nodes;
1790
+
1791
+ lm_ggml_hash_set_free(&hash_set);
1792
+ free(node_copies);
1793
+ free(node_init);
1794
+
1795
+ return {
1796
+ /* .buffer = */ buffer,
1797
+ /* .ctx_allocated = */ ctx_allocated,
1798
+ /* .ctx_unallocated = */ ctx_unallocated,
1799
+ /* .graph = */ graph_copy,
1800
+ };
1801
+ }
1802
+
1803
+ void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
1804
+ lm_ggml_backend_buffer_free(copy.buffer);
1805
+ lm_ggml_free(copy.ctx_allocated);
1806
+ lm_ggml_free(copy.ctx_unallocated);
1807
+ }
1808
+
1809
+ bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
1810
+ struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
1811
+ if (copy.buffer == NULL) {
1812
+ return false;
1813
+ }
1814
+
1815
+ struct lm_ggml_cgraph * g1 = graph;
1816
+ struct lm_ggml_cgraph * g2 = copy.graph;
1817
+
1818
+ assert(g1->n_nodes == g2->n_nodes);
1819
+
1820
+ for (int i = 0; i < g1->n_nodes; i++) {
1821
+ //printf("eval %d/%d\n", i, g1->n_nodes);
1822
+ struct lm_ggml_tensor * t1 = g1->nodes[i];
1823
+ struct lm_ggml_tensor * t2 = g2->nodes[i];
1824
+
1825
+ assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
1826
+
1827
+ struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
1828
+ struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
1829
+
1830
+ lm_ggml_backend_graph_compute(backend1, &g1v);
1831
+ lm_ggml_backend_graph_compute(backend2, &g2v);
1832
+
1833
+ if (lm_ggml_is_view_op(t1->op)) {
1834
+ continue;
1835
+ }
1836
+
1837
+ // compare results, calculate rms etc
1838
+ if (!callback(i, t1, t2, user_data)) {
1839
+ break;
1840
+ }
1841
+ }
1842
+
1843
+ lm_ggml_backend_graph_copy_free(copy);
1844
+
1845
+ return true;
1846
+ }
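
Editor's note: lm_ggml_backend_compare_graph_backend() above evaluates the two graphs node by node and leaves the actual comparison to the caller-supplied lm_ggml_backend_eval_callback. The callback below is a rough, hypothetical example and not part of the packaged diff; it assumes both tensors hold F32 data and that user_data points to a float tolerance.

// hypothetical F32-only comparison callback -- not part of the packaged sources
#include "ggml-backend.h"   // assumed include path within package/cpp
#include <algorithm>
#include <cmath>
#include <vector>

static bool compare_f32_cb(int node_index, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data) {
    (void) node_index; // unused in this sketch
    const float tol = *(const float *) user_data;

    // copy both results back to host memory so any backend pair can be compared
    std::vector<float> a(lm_ggml_nelements(t1));
    std::vector<float> b(lm_ggml_nelements(t2));
    lm_ggml_backend_tensor_get(t1, a.data(), 0, lm_ggml_nbytes(t1));
    lm_ggml_backend_tensor_get(t2, b.data(), 0, lm_ggml_nbytes(t2));

    float max_diff = 0.0f;
    for (size_t i = 0; i < a.size(); i++) {
        max_diff = std::max(max_diff, std::fabs(a[i] - b[i]));
    }

    // returning false stops the node-by-node loop in lm_ggml_backend_compare_graph_backend()
    return max_diff <= tol;
}

Passed as the callback with &tol as user_data, this plays the role of the "compare results, calculate rms etc" hook mentioned in the code above.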
+
+// CPU backend - buffer
+
+static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
+}
+
+static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
+    lm_ggml_aligned_free(buffer->context, buffer->size);
+}
+
+static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    LM_GGML_UNUSED(buffer);
+}
+
+static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    LM_GGML_UNUSED(buffer);
+}
+
+static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    LM_GGML_UNUSED(buffer);
+}
+
+static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
+    if (lm_ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, lm_ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    LM_GGML_UNUSED(buffer);
+}
+
+static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
+    /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
+    /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+// CPU backend buffer type
+
+// this buffer type is defined here to make it available to all backends
+
+static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    LM_GGML_UNUSED(buft);
+}
+
+static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = lm_ggml_aligned_malloc(size);
+
+    if (data == NULL) {
+        LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
+}
+
+static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    LM_GGML_UNUSED(buft);
+}
+
+static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
+    return true;
+
+    LM_GGML_UNUSED(buft);
+}
+
+lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
+    static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
+            /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &lm_ggml_backend_cpu_buffer_type;
+}
+
+static const char * lm_ggml_backend_cpu_buffer_from_ptr_type_get_name(lm_ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    LM_GGML_UNUSED(buft);
+}
+
+static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ lm_ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
+            /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &lm_ggml_backend_cpu_buffer_type;
+}
+
+lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+}
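
Editor's note: a minimal, hypothetical sketch (not part of the packaged diff) of combining lm_ggml_backend_cpu_buffer_from_ptr() above with lm_ggml_backend_tensor_alloc() from the utils section to place a tensor in caller-owned memory. It assumes a lm_ggml_context created with no_alloc = true, that 64-byte alignment satisfies TENSOR_ALIGNMENT, and that the helper name and sizes are illustrative only.

// hypothetical usage sketch -- not part of the packaged sources
#include "ggml.h"           // assumed include paths within package/cpp
#include "ggml-backend.h"
#include <cstdlib>

static void place_tensor_in_user_memory(struct lm_ggml_context * ctx /* created with no_alloc = true */) {
    const size_t buf_size = 1024 * 1024;            // 1 MiB, a multiple of the alignment
    void * mem = std::aligned_alloc(64, buf_size);  // assumption: 64 >= TENSOR_ALIGNMENT
    if (mem == NULL) {
        return;
    }

    // wrap the caller-owned memory; the resulting buffer never frees `mem` itself
    lm_ggml_backend_buffer_t buf = lm_ggml_backend_cpu_buffer_from_ptr(mem, buf_size);

    // the tensor has no data yet (no_alloc context); place it at the start of the buffer
    struct lm_ggml_tensor * t = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 256);
    lm_ggml_backend_tensor_alloc(buf, t, lm_ggml_backend_buffer_get_base(buf));

    // ... use t->data directly, or go through lm_ggml_backend_tensor_set/get ...

    lm_ggml_backend_buffer_free(buf);               // frees only the buffer wrapper
    std::free(mem);
}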