whispercpp 1.3.0 → 1.3.1

Files changed (132)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +60 -11
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -16
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/{ggml.h → ggml/include/ggml.h} +479 -596
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/{whisper.h → include/whisper.h} +23 -22
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1492 -9
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/{whisper.cpp → src/whisper.cpp} +661 -492
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -21755
data/ext/ggml/src/ggml-backend.cpp
@@ -0,0 +1,1999 @@
1
+ // Note: porting this file to C++ is a work in progress
2
+
3
+ #ifdef _WIN32
4
+ #define WIN32_LEAN_AND_MEAN
5
+ #ifndef NOMINMAX
6
+ # define NOMINMAX
7
+ #endif
8
+ #include <windows.h>
9
+ #endif
10
+
11
+ #include "ggml-backend.h"
12
+ #include "ggml-backend-impl.h"
13
+ #include "ggml-alloc.h"
14
+ #include "ggml-impl.h"
15
+
16
+ #include <assert.h>
17
+ #include <limits.h>
18
+ #include <stdarg.h>
19
+ #include <stdio.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ #ifdef __APPLE__
26
+ #include <sys/types.h>
27
+ #include <sys/sysctl.h>
28
+ #endif
29
+
30
+
31
+ // backend buffer type
32
+
33
+ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
34
+ return buft->iface.get_name(buft);
35
+ }
36
+
37
+ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
38
+ if (size == 0) {
39
+ // return a dummy buffer for zero-sized allocations
40
+ return ggml_backend_buffer_init(buft, {}, NULL, 0);
41
+ }
42
+
43
+ return buft->iface.alloc_buffer(buft, size);
44
+ }
45
+
46
+ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
47
+ return buft->iface.get_alignment(buft);
48
+ }
49
+
50
+ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
51
+ // get_max_size is optional, defaults to SIZE_MAX
52
+ if (buft->iface.get_max_size) {
53
+ return buft->iface.get_max_size(buft);
54
+ }
55
+ return SIZE_MAX;
56
+ }
57
+
58
+ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
59
+ // get_alloc_size is optional, defaults to ggml_nbytes
60
+ if (buft->iface.get_alloc_size) {
61
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
62
+ assert(size >= ggml_nbytes(tensor));
63
+ return size;
64
+ }
65
+ return ggml_nbytes(tensor);
66
+ }
67
+
68
+ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
69
+ if (buft->iface.is_host) {
70
+ return buft->iface.is_host(buft);
71
+ }
72
+ return false;
73
+ }
74
+
75
+ ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
76
+ return buft->device;
77
+ }
78
+
79
+ // backend buffer
80
+
81
+ ggml_backend_buffer_t ggml_backend_buffer_init(
82
+ ggml_backend_buffer_type_t buft,
83
+ struct ggml_backend_buffer_i iface,
84
+ void * context,
85
+ size_t size) {
86
+ ggml_backend_buffer_t buffer = new ggml_backend_buffer {
87
+ /* .interface = */ iface,
88
+ /* .buft = */ buft,
89
+ /* .context = */ context,
90
+ /* .size = */ size,
91
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
92
+ };
93
+
94
+ return buffer;
95
+ }
96
+
97
+ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
98
+ return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
99
+ }
100
+
101
+ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
102
+ if (buffer == NULL) {
103
+ return;
104
+ }
105
+
106
+ if (buffer->iface.free_buffer != NULL) {
107
+ buffer->iface.free_buffer(buffer);
108
+ }
109
+ delete buffer;
110
+ }
111
+
112
+ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
113
+ return buffer->size;
114
+ }
115
+
116
+ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
117
+ // get_base is optional if the buffer is zero-sized
118
+ if (buffer->size == 0) {
119
+ return NULL;
120
+ }
121
+
122
+ void * base = buffer->iface.get_base(buffer);
123
+
124
+ GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
125
+
126
+ return base;
127
+ }
128
+
129
+ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
130
+ // init_tensor is optional
131
+ if (buffer->iface.init_tensor) {
132
+ buffer->iface.init_tensor(buffer, tensor);
133
+ }
134
+ }
135
+
136
+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
137
+ // clear is optional if the buffer is zero-sized
138
+ if (buffer->size == 0) {
139
+ return;
140
+ }
141
+
142
+ buffer->iface.clear(buffer, value);
143
+ }
144
+
145
+ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
146
+ return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
147
+ }
148
+
149
+ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
150
+ return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
151
+ }
152
+
153
+ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
154
+ return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
155
+ }
156
+
157
+ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
158
+ return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
159
+ }
160
+
161
+ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
162
+ buffer->usage = usage;
163
+
164
+ // FIXME: add a generic callback to the buffer interface
165
+ if (ggml_backend_buffer_is_multi_buffer(buffer)) {
166
+ ggml_backend_multi_buffer_set_usage(buffer, usage);
167
+ }
168
+ }
169
+
170
+ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
171
+ return buffer->usage;
172
+ }
173
+
174
+ ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
175
+ return buffer->buft;
176
+ }
177
+
178
+ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
179
+ if (buffer->iface.reset) {
180
+ buffer->iface.reset(buffer);
181
+ }
182
+ }
183
+
184
+ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
185
+ ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
186
+ if (dst_buf->iface.cpy_tensor) {
187
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
188
+ }
189
+ return false;
190
+ }
191
+
192
+ // backend
193
+
194
+ ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
195
+ if (backend == NULL) {
196
+ return NULL;
197
+ }
198
+ return backend->guid;
199
+ }
200
+
201
+ const char * ggml_backend_name(ggml_backend_t backend) {
202
+ if (backend == NULL) {
203
+ return "NULL";
204
+ }
205
+ return backend->iface.get_name(backend);
206
+ }
207
+
208
+ void ggml_backend_free(ggml_backend_t backend) {
209
+ if (backend == NULL) {
210
+ return;
211
+ }
212
+
213
+ backend->iface.free(backend);
214
+ }
215
+
216
+ ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
217
+ return ggml_backend_dev_buffer_type(backend->device);
218
+ }
219
+
220
+ ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
221
+ return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
222
+ }
223
+
224
+ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
225
+ return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
226
+ }
227
+
228
+ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
229
+ return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
230
+ }
231
+
232
+ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
233
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
234
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
235
+
236
+ if (backend->iface.set_tensor_async == NULL) {
237
+ ggml_backend_tensor_set(tensor, data, offset, size);
238
+ } else {
239
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
240
+ }
241
+ }
242
+
243
+ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
244
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
245
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
246
+
247
+ if (backend->iface.get_tensor_async == NULL) {
248
+ ggml_backend_tensor_get(tensor, data, offset, size);
249
+ } else {
250
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
251
+ }
252
+ }
253
+
254
+ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
255
+ GGML_ASSERT(tensor);
256
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
257
+
258
+ if (size == 0) {
259
+ return;
260
+ }
261
+
262
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
263
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
264
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
265
+
266
+ buf->iface.set_tensor(buf, tensor, data, offset, size);
267
+ }
268
+
269
+ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
270
+ GGML_ASSERT(tensor);
271
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
272
+
273
+ if (size == 0) {
274
+ return;
275
+ }
276
+
277
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
278
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
279
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
280
+
281
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
282
+ }
283
+
284
+ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
285
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
286
+
287
+ if (size == 0) {
288
+ return;
289
+ }
290
+
291
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
292
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
293
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
294
+ GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
295
+
296
+ buf->iface.memset_tensor(buf, tensor, value, offset, size);
297
+ }
298
+
299
+ void ggml_backend_synchronize(ggml_backend_t backend) {
300
+ if (backend->iface.synchronize == NULL) {
301
+ return;
302
+ }
303
+
304
+ backend->iface.synchronize(backend);
305
+ }
306
+
307
+ ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
308
+ GGML_ASSERT(backend->iface.graph_plan_create != NULL);
309
+
310
+ return backend->iface.graph_plan_create(backend, cgraph);
311
+ }
312
+
313
+ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
314
+ GGML_ASSERT(backend->iface.graph_plan_free != NULL);
315
+
316
+ backend->iface.graph_plan_free(backend, plan);
317
+ }
318
+
319
+ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
320
+ GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
321
+
322
+ return backend->iface.graph_plan_compute(backend, plan);
323
+ }
324
+
325
+ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
326
+ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
327
+ ggml_backend_synchronize(backend);
328
+ return err;
329
+ }
330
+
331
+ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
332
+ return backend->iface.graph_compute(backend, cgraph);
333
+ }
334
+
335
+ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
336
+ return ggml_backend_dev_supports_op(backend->device, op);
337
+ }
338
+
339
+ bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
340
+ return ggml_backend_dev_supports_buft(backend->device, buft);
341
+ }
342
+
343
+ bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
344
+ return ggml_backend_dev_offload_op(backend->device, op);
345
+ }
346
+
347
+ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
348
+ return backend->device;
349
+ }
350
+
351
+ // backend copy
352
+
353
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
354
+ if (a->type != b->type) {
355
+ return false;
356
+ }
357
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
358
+ if (a->ne[i] != b->ne[i]) {
359
+ return false;
360
+ }
361
+ if (a->nb[i] != b->nb[i]) {
362
+ return false;
363
+ }
364
+ }
365
+ return true;
366
+ }
367
+
368
+ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
369
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
370
+
371
+ if (src == dst) {
372
+ return;
373
+ }
374
+
375
+ if (ggml_backend_buffer_is_host(src->buffer)) {
376
+ ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
377
+ } else if (ggml_backend_buffer_is_host(dst->buffer)) {
378
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
379
+ } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
380
+ #ifndef NDEBUG
381
+ GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
382
+ #endif
383
+ size_t nbytes = ggml_nbytes(src);
384
+ void * data = malloc(nbytes);
385
+ ggml_backend_tensor_get(src, data, 0, nbytes);
386
+ ggml_backend_tensor_set(dst, data, 0, nbytes);
387
+ free(data);
388
+ }
389
+ }
390
+
391
+ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
392
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
393
+
394
+ if (src == dst) {
395
+ return;
396
+ }
397
+
398
+ if (backend_dst->iface.cpy_tensor_async != NULL) {
399
+ if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
400
+ return;
401
+ }
402
+ }
403
+
404
+ // an async copy would normally happen after all the queued operations on both backends are completed
405
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
406
+ ggml_backend_synchronize(backend_src);
407
+ ggml_backend_synchronize(backend_dst);
408
+ ggml_backend_tensor_copy(src, dst);
409
+ }
410
+
411
+ // events
412
+
413
+ ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
414
+ // null device is allowed for the transition period to the device interface
415
+ if (device == NULL || device->iface.event_new == NULL) {
416
+ return NULL;
417
+ }
418
+ return device->iface.event_new(device);
419
+ }
420
+
421
+ void ggml_backend_event_free(ggml_backend_event_t event) {
422
+ if (event == NULL) {
423
+ return;
424
+ }
425
+ event->device->iface.event_free(event->device, event);
426
+ }
427
+
428
+ void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
429
+ GGML_ASSERT(backend->iface.event_record != NULL);
430
+
431
+ backend->iface.event_record(backend, event);
432
+ }
433
+
434
+ void ggml_backend_event_synchronize(ggml_backend_event_t event) {
435
+ GGML_ASSERT(event->device->iface.event_synchronize);
436
+
437
+ event->device->iface.event_synchronize(event->device, event);
438
+ }
439
+
440
+ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
441
+ GGML_ASSERT(backend->iface.event_wait != NULL);
442
+
443
+ backend->iface.event_wait(backend, event);
444
+ }
445
+
446
+ // Backend device
447
+
448
+ const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
449
+ return device->iface.get_name(device);
450
+ }
451
+
452
+ const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
453
+ return device->iface.get_description(device);
454
+ }
455
+
456
+ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
457
+ device->iface.get_memory(device, free, total);
458
+ }
459
+
460
+ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
461
+ return device->iface.get_type(device);
462
+ }
463
+
464
+ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
465
+ memset(props, 0, sizeof(*props));
466
+ device->iface.get_props(device, props);
467
+ }
468
+
469
+ ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
470
+ return device->reg;
471
+ }
472
+
473
+ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
474
+ return device->iface.init_backend(device, params);
475
+ }
476
+
477
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
478
+ return device->iface.get_buffer_type(device);
479
+ }
480
+
481
+ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
482
+ if (device->iface.get_host_buffer_type == NULL) {
483
+ return NULL;
484
+ }
485
+
486
+ return device->iface.get_host_buffer_type(device);
487
+ }
488
+
489
+ ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
490
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
491
+ }
492
+
493
+ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
494
+ return device->iface.supports_op(device, op);
495
+ }
496
+
497
+ bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
498
+ return device->iface.supports_buft(device, buft);
499
+ }
500
+
501
+ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
502
+ if (device->iface.offload_op != NULL) {
503
+ return device->iface.offload_op(device, op);
504
+ }
505
+
506
+ return false;
507
+ }
508
+
509
+ // Backend (reg)
510
+
511
+ const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
512
+ return reg->iface.get_name(reg);
513
+ }
514
+
515
+ size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
516
+ return reg->iface.get_device_count(reg);
517
+ }
518
+
519
+ ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
520
+ return reg->iface.get_device(reg, index);
521
+ }
522
+
523
+ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
524
+ if (!reg->iface.get_proc_address) {
525
+ return NULL;
526
+ }
527
+ return reg->iface.get_proc_address(reg, name);
528
+ }
529
+
530
+ // multi-buffer buffer
531
+
532
+ struct ggml_backend_multi_buffer_context {
533
+ ggml_backend_buffer_t * buffers;
534
+ size_t n_buffers;
535
+ };
536
+
537
+ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
538
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
539
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
540
+ ggml_backend_buffer_free(ctx->buffers[i]);
541
+ }
542
+
543
+ free(ctx->buffers);
544
+ free(ctx);
545
+ }
546
+
547
+ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
548
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
549
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
550
+ ggml_backend_buffer_clear(ctx->buffers[i], value);
551
+ }
552
+ }
553
+
554
+ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
555
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
556
+ /* .get_base = */ NULL,
557
+ /* .init_tensor = */ NULL,
558
+ /* .memset_tensor = */ NULL,
559
+ /* .set_tensor = */ NULL,
560
+ /* .get_tensor = */ NULL,
561
+ /* .cpy_tensor = */ NULL,
562
+ /* .clear = */ ggml_backend_multi_buffer_clear,
563
+ /* .reset = */ NULL,
564
+ };
565
+
566
+ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
567
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
568
+ ctx->n_buffers = n_buffers;
569
+ ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
570
+
571
+ GGML_ASSERT(ctx->buffers != NULL);
572
+
573
+ size_t total_size = 0;
574
+ for (size_t i = 0; i < n_buffers; i++) {
575
+ ctx->buffers[i] = buffers[i];
576
+ total_size += ggml_backend_buffer_get_size(buffers[i]);
577
+ }
578
+
579
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
580
+ }
581
+
582
+ bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
583
+ return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
584
+ }
585
+
586
+ void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
587
+ GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
588
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
589
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
590
+ ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
591
+ }
592
+ }
593
+
594
+ // creates a copy of the tensor with the same memory layout
595
+ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
596
+ struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
597
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
598
+ dup->nb[i] = tensor->nb[i];
599
+ }
600
+ return dup;
601
+ }
602
+
603
+ static bool ggml_is_view_op(enum ggml_op op) {
604
+ return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
605
+ }
606
+
607
+ // scheduler
608
+
609
+ #ifndef GGML_SCHED_MAX_BACKENDS
610
+ #define GGML_SCHED_MAX_BACKENDS 16
611
+ #endif
612
+
613
+ #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
614
+ #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
615
+ #endif
616
+
617
+ #ifndef GGML_SCHED_MAX_COPIES
618
+ #define GGML_SCHED_MAX_COPIES 4
619
+ #endif
620
+
621
+ struct ggml_backend_sched_split {
622
+ int backend_id;
623
+ int i_start;
624
+ int i_end;
625
+ struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
626
+ int n_inputs;
627
+ // graph view of this split
628
+ struct ggml_cgraph graph;
629
+ };
630
+
631
+ struct ggml_backend_sched {
632
+ bool is_reset; // true if the scheduler has been reset since the last graph split
633
+ bool is_alloc;
634
+
635
+ int n_backends;
636
+
637
+ ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
638
+ ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
639
+ ggml_gallocr_t galloc;
640
+
641
+ // hash map of the nodes in the graph
642
+ struct ggml_hash_set hash_set;
643
+ int * hv_tensor_backend_ids; // [hash_set.size]
644
+ struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
645
+
646
+ int * node_backend_ids; // [graph_size]
647
+ int * leaf_backend_ids; // [graph_size]
648
+
649
+ int * prev_node_backend_ids; // [graph_size]
650
+ int * prev_leaf_backend_ids; // [graph_size]
651
+
652
+ // copy of the graph with modified inputs
653
+ struct ggml_cgraph graph;
654
+
655
+ // graph splits
656
+ struct ggml_backend_sched_split * splits;
657
+ int n_splits;
658
+ int splits_capacity;
659
+
660
+ // pipeline parallelism support
661
+ int n_copies;
662
+ int cur_copy;
663
+ ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
664
+ struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
665
+ int n_graph_inputs;
666
+
667
+ struct ggml_context * ctx;
668
+
669
+ ggml_backend_sched_eval_callback callback_eval;
670
+ void * callback_eval_user_data;
671
+
672
+ char * context_buffer;
673
+ size_t context_buffer_size;
674
+
675
+ int debug;
676
+ };
677
+
678
+ #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
679
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
680
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
681
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
682
+
683
+ // returns the priority of the backend, lower id is higher priority
684
+ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
685
+ for (int i = 0; i < sched->n_backends; i++) {
686
+ if (sched->backends[i] == backend) {
687
+ return i;
688
+ }
689
+ }
690
+ return -1;
691
+ }
692
+
693
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
694
+ ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
695
+ if (buffer == NULL) {
696
+ return -1;
697
+ }
698
+
699
+ // find highest prio backend that supports the buffer type and the op
700
+ for (int i = 0; i < sched->n_backends; i++) {
701
+ if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
702
+ ggml_backend_supports_op(sched->backends[i], op)) {
703
+ return i;
704
+ }
705
+ }
706
+
707
+ #ifndef NDEBUG
708
+ GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
709
+ __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
710
+ #endif
711
+
712
+ return -1;
713
+ }
714
+
715
+ #if 0
716
+ #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
717
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
718
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
719
+ #define GET_CAUSE(node) causes[hash_id(node)]
720
+ #else
721
+ #define SET_CAUSE(node, ...)
722
+ #define GET_CAUSE(node) ""
723
+ #endif
724
+
725
+ // returns the backend that should be used for the node based on the current locations
726
+ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
727
+ // assign pre-allocated nodes to their backend
728
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
729
+ if (cur_backend_id != -1) {
730
+ SET_CAUSE(tensor, "1.dst");
731
+ return cur_backend_id;
732
+ }
733
+
734
+ // view_src
735
+ if (tensor->view_src != NULL) {
736
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
737
+ if (cur_backend_id != -1) {
738
+ SET_CAUSE(tensor, "1.vsrc");
739
+ return cur_backend_id;
740
+ }
741
+ }
742
+
743
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
744
+ // since the tensor is pre-allocated, it cannot be moved to another backend
745
+ ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
746
+ GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
747
+ }
748
+
749
+ // graph input
750
+ if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
751
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
752
+ SET_CAUSE(tensor, "1.inp");
753
+ return cur_backend_id;
754
+ }
755
+
756
+ // operations with weights are preferably run on the same backend as the weights
757
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
758
+ const struct ggml_tensor * src = tensor->src[i];
759
+ if (src == NULL) {
760
+ continue;
761
+ }
762
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
763
+ // not an ideal solution
764
+ if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
765
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
766
+ // check if a backend with higher prio wants to offload the op
767
+ if (src_backend_id == sched->n_backends - 1) {
768
+ for (int b = 0; b < src_backend_id; b++) {
769
+ if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
770
+ SET_CAUSE(tensor, "1.off");
771
+ return b;
772
+ }
773
+ }
774
+ }
775
+ SET_CAUSE(tensor, "1.wgt%d", i);
776
+ return src_backend_id;
777
+ }
778
+ }
779
+
780
+ return -1;
781
+ }
782
+
783
+ static char * fmt_size(size_t size) {
784
+ static char buffer[128];
785
+ if (size >= 1024*1024) {
786
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
787
+ } else {
788
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
789
+ }
790
+ return buffer;
791
+ }
792
+
793
+ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
794
+ int cur_split = 0;
795
+ for (int i = 0; i < graph->n_nodes; i++) {
796
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
797
+ ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
798
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
799
+ sched->splits[cur_split].n_inputs);
800
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
801
+ GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
802
+ fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
803
+ }
804
+ GGML_LOG_DEBUG("\n");
805
+ cur_split++;
806
+ }
807
+ struct ggml_tensor * node = graph->nodes[i];
808
+ if (ggml_is_view_op(node->op)) {
809
+ continue;
810
+ }
811
+ if (sched->debug > 1) {
812
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
813
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
814
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
815
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
816
+ struct ggml_tensor * src = node->src[j];
817
+ if (src == NULL) {
818
+ continue;
819
+ }
820
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
821
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
822
+ fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
823
+ }
824
+ GGML_LOG_DEBUG("\n");
825
+ }
826
+ }
827
+ }
828
+
829
+ static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
830
+ ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
831
+ ggml_backend_buffer_type_t buft = NULL;
832
+
833
+ if (buf) {
834
+ // the tensor is already allocated
835
+ buft = buf->buft;
836
+ } else {
837
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
838
+ int tensor_backend_id = tensor_backend_id(t);
839
+ if (tensor_backend_id == -1 && t->view_src) {
840
+ tensor_backend_id = tensor_backend_id(t->view_src);
841
+ }
842
+ if (tensor_backend_id != -1) {
843
+ buft = sched->bufts[tensor_backend_id];
844
+ }
845
+ }
846
+
847
+ return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
848
+ }
849
+
850
+ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
851
+ if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
852
+ *node_backend_id = cur_backend_id;
853
+ SET_CAUSE(node, "2.sup");
854
+ }
855
+ }
856
+
857
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
858
+ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
859
+ // reset splits
860
+ sched->n_splits = 0;
861
+ sched->n_graph_inputs = 0;
862
+ sched->is_reset = false;
863
+
864
+ struct ggml_init_params params = {
865
+ /* .mem_size = */ sched->context_buffer_size,
866
+ /* .mem_buffer = */ sched->context_buffer,
867
+ /* .no_alloc = */ true
868
+ };
869
+
870
+ ggml_free(sched->ctx);
871
+
872
+ sched->ctx = ggml_init(params);
873
+ if (sched->ctx == NULL) {
874
+ GGML_ABORT("%s: failed to initialize context\n", __func__);
875
+ }
876
+
877
+ // pass 1: assign backends to ops with pre-allocated inputs
878
+ for (int i = 0; i < graph->n_leafs; i++) {
879
+ struct ggml_tensor * leaf = graph->leafs[i];
880
+ int * leaf_backend_id = &tensor_backend_id(leaf);
881
+ // do not overwrite user assignments
882
+ if (*leaf_backend_id == -1) {
883
+ *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
884
+ }
885
+ }
886
+
887
+ for (int i = 0; i < graph->n_nodes; i++) {
888
+ struct ggml_tensor * node = graph->nodes[i];
889
+ int * node_backend_id = &tensor_backend_id(node);
890
+ // do not overwrite user assignments
891
+ if (*node_backend_id == -1) {
892
+ *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
893
+
894
+ #if 0
895
+ // src
896
+ if (node->op == GGML_OP_NONE) {
897
+ continue;
898
+ }
899
+
900
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
901
+ struct ggml_tensor * src = node->src[j];
902
+ if (src == NULL) {
903
+ continue;
904
+ }
905
+ int * src_backend_id = &tensor_backend_id(src);
906
+ if (*src_backend_id == -1) {
907
+ *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
908
+ }
909
+ }
910
+ #endif
911
+ }
912
+ }
913
+
914
+ // pass 2: expand current backend assignments
915
+ // assign the same backend to adjacent nodes
916
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
917
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
918
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
919
+ // expand gpu down
920
+ {
921
+ int cur_backend_id = -1;
922
+ for (int i = 0; i < graph->n_nodes; i++) {
923
+ struct ggml_tensor * node = graph->nodes[i];
924
+ if (ggml_is_view_op(node->op)) {
925
+ continue;
926
+ }
927
+ int * node_backend_id = &tensor_backend_id(node);
928
+ if (*node_backend_id != -1) {
929
+ if (*node_backend_id == sched->n_backends - 1) {
930
+ // skip cpu (lowest prio backend)
931
+ cur_backend_id = -1;
932
+ } else {
933
+ cur_backend_id = *node_backend_id;
934
+ }
935
+ } else if (cur_backend_id != -1) {
936
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
937
+ }
938
+ }
939
+ }
940
+ // expand gpu up
941
+ {
942
+ int cur_backend_id = -1;
943
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
944
+ struct ggml_tensor * node = graph->nodes[i];
945
+ if (ggml_is_view_op(node->op)) {
946
+ continue;
947
+ }
948
+ int * node_backend_id = &tensor_backend_id(node);
949
+ if (*node_backend_id != -1) {
950
+ if (*node_backend_id == sched->n_backends - 1) {
951
+ // skip cpu (lowest prio backend)
952
+ cur_backend_id = -1;
953
+ } else {
954
+ cur_backend_id = *node_backend_id;
955
+ }
956
+ } else if (cur_backend_id != -1) {
957
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
958
+ }
959
+ }
960
+ }
961
+ // expand rest down
962
+ {
963
+ int cur_backend_id = -1;
964
+ for (int i = 0; i < graph->n_nodes; i++) {
965
+ struct ggml_tensor * node = graph->nodes[i];
966
+ if (ggml_is_view_op(node->op)) {
967
+ continue;
968
+ }
969
+ int * node_backend_id = &tensor_backend_id(node);
970
+ if (*node_backend_id != -1) {
971
+ cur_backend_id = *node_backend_id;
972
+ } else if (cur_backend_id != -1) {
973
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
974
+ }
975
+ }
976
+ }
977
+ // expand rest up
978
+ {
979
+ int cur_backend_id = -1;
980
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
981
+ struct ggml_tensor * node = graph->nodes[i];
982
+ if (ggml_is_view_op(node->op)) {
983
+ continue;
984
+ }
985
+ int * node_backend_id = &tensor_backend_id(node);
986
+ if (*node_backend_id != -1) {
987
+ cur_backend_id = *node_backend_id;
988
+ } else if (cur_backend_id != -1) {
989
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
990
+ }
991
+ }
992
+ }
993
+
994
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
995
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
996
+ // however, we also need to verify that the sources are in compatible buffer types
997
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
998
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
999
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1000
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1001
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1002
+ for (int i = 0; i < graph->n_nodes; i++) {
1003
+ struct ggml_tensor * node = graph->nodes[i];
1004
+ if (ggml_is_view_op(node->op)) {
1005
+ continue;
1006
+ }
1007
+ int * node_backend_id = &tensor_backend_id(node);
1008
+ if (*node_backend_id == -1) {
1009
+ // unassigned node: find the backend with the most supported inputs
1010
+ int n_supported_best = -1;
1011
+ for (int b = 0; b < sched->n_backends; b++) {
1012
+ if (ggml_backend_supports_op(sched->backends[b], node)) {
1013
+ int n_supported = 0;
1014
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1015
+ struct ggml_tensor * src = node->src[j];
1016
+ if (src == NULL) {
1017
+ continue;
1018
+ }
1019
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
1020
+ n_supported++;
1021
+ }
1022
+ }
1023
+ if (n_supported > n_supported_best) {
1024
+ n_supported_best = n_supported;
1025
+ *node_backend_id = b;
1026
+ SET_CAUSE(node, "3.best");
1027
+ }
1028
+ }
1029
+ }
1030
+ } else {
1031
+ // assigned node: upgrade to higher prio backend if possible
1032
+ for (int b = 0; b < *node_backend_id; b++) {
1033
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
1034
+ bool supported = true;
1035
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1036
+ struct ggml_tensor * src = node->src[j];
1037
+ if (src == NULL) {
1038
+ continue;
1039
+ }
1040
+ if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
1041
+ supported = false;
1042
+ break;
1043
+ }
1044
+ }
1045
+ if (supported) {
1046
+ *node_backend_id = b;
1047
+ SET_CAUSE(node, "3.upg");
1048
+ break;
1049
+ }
1050
+ }
1051
+ }
1052
+ }
1053
+ }
1054
+
1055
+ // pass 4: assign backends to remaining src from dst and view_src
1056
+ for (int i = 0; i < graph->n_nodes; i++) {
1057
+ struct ggml_tensor * node = graph->nodes[i];
1058
+ int * cur_backend_id = &tensor_backend_id(node);
1059
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1060
+ *cur_backend_id = tensor_backend_id(node->view_src);
1061
+ SET_CAUSE(node, "4.vsrc");
1062
+ }
1063
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1064
+ struct ggml_tensor * src = node->src[j];
1065
+ if (src == NULL) {
1066
+ continue;
1067
+ }
1068
+ int * src_backend_id = &tensor_backend_id(src);
1069
+ if (*src_backend_id == -1) {
1070
+ if (src->view_src != NULL) {
1071
+ // views are always on the same backend as the source
1072
+ *src_backend_id = tensor_backend_id(src->view_src);
1073
+ SET_CAUSE(src, "4.vsrc");
1074
+ } else {
1075
+ *src_backend_id = *cur_backend_id;
1076
+ SET_CAUSE(src, "4.cur");
1077
+ }
1078
+ }
1079
+ }
1080
+ }
1081
+
1082
+ // pass 5: split graph, find tensors that need to be copied
1083
+ {
1084
+ int i_split = 0;
1085
+ struct ggml_backend_sched_split * split = &sched->splits[0];
1086
+ // find the backend of the first split, skipping view ops
1087
+ int i = 0;
1088
+ for (; i < graph->n_nodes; i++) {
1089
+ struct ggml_tensor * node = graph->nodes[i];
1090
+ if (!ggml_is_view_op(node->op)) {
1091
+ split->backend_id = tensor_backend_id(node);
1092
+ break;
1093
+ }
1094
+ }
1095
+ split->i_start = 0;
1096
+ split->n_inputs = 0;
1097
+ int cur_backend_id = split->backend_id;
1098
+ for (; i < graph->n_nodes; i++) {
1099
+ struct ggml_tensor * node = graph->nodes[i];
1100
+
1101
+ if (ggml_is_view_op(node->op)) {
1102
+ continue;
1103
+ }
1104
+
1105
+ const int node_backend_id = tensor_backend_id(node);
1106
+
1107
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1108
+
1109
+ // check if we should start a new split based on the sources of the current node
1110
+ bool need_new_split = false;
1111
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1112
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1113
+ struct ggml_tensor * src = node->src[j];
1114
+ if (src == NULL) {
1115
+ continue;
1116
+ }
1117
+ // check if a weight is on a different and incompatible backend
1118
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1119
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1120
+ int src_backend_id = tensor_backend_id(src);
1121
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1122
+ need_new_split = true;
1123
+ break;
1124
+ }
1125
+ }
1126
+ // check if the split has too many inputs
1127
+ // FIXME: count the number of inputs instead of only checking when full
1128
+ if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1129
+ const size_t id = hash_id(src);
1130
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1131
+ bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1132
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1133
+ need_new_split = true;
1134
+ break;
1135
+ }
1136
+ }
1137
+ }
1138
+ }
1139
+
1140
+ if (node_backend_id != cur_backend_id || need_new_split) {
1141
+ split->i_end = i;
1142
+ i_split++;
1143
+ if (i_split >= sched->splits_capacity) {
1144
+ sched->splits_capacity *= 2;
1145
+ sched->splits = (ggml_backend_sched_split *)
1146
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1147
+ GGML_ASSERT(sched->splits != NULL);
1148
+ }
1149
+ split = &sched->splits[i_split];
1150
+ split->backend_id = node_backend_id;
1151
+ split->i_start = i;
1152
+ split->n_inputs = 0;
1153
+ cur_backend_id = node_backend_id;
1154
+ }
1155
+
1156
+ // find inputs that are not on the same backend
1157
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1158
+ struct ggml_tensor * src = node->src[j];
1159
+ if (src == NULL) {
1160
+ continue;
1161
+ }
1162
+
1163
+ size_t src_id = hash_id(src);
1164
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1165
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1166
+
1167
+ if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1168
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1169
+ ggml_backend_t backend = sched->backends[src_backend_id];
1170
+ for (int c = 0; c < sched->n_copies; c++) {
1171
+ struct ggml_tensor * tensor_copy;
1172
+ if (c == sched->cur_copy) {
1173
+ tensor_copy = src; // use the original tensor as the current copy
1174
+ } else {
1175
+ tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1176
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1177
+ }
1178
+ if (sched->n_copies > 1) {
1179
+ ggml_set_input(tensor_copy);
1180
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1181
+ }
1182
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1183
+ SET_CAUSE(tensor_copy, "4.cpy");
1184
+ }
1185
+ int n_graph_inputs = sched->n_graph_inputs++;
1186
+ GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1187
+ sched->graph_inputs[n_graph_inputs] = src;
1188
+ }
1189
+ }
1190
+
1191
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1192
+ // create a copy of the input in the split's backend
1193
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1194
+ ggml_backend_t backend = sched->backends[cur_backend_id];
1195
+ for (int c = 0; c < sched->n_copies; c++) {
1196
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1197
+ ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1198
+ if (sched->n_copies > 1) {
1199
+ ggml_set_input(tensor_copy);
1200
+ ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1201
+ }
1202
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1203
+ SET_CAUSE(tensor_copy, "4.cpy");
1204
+ }
1205
+ int n_inputs = split->n_inputs++;
1206
+ GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1207
+ split->inputs[n_inputs] = src;
1208
+ }
1209
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1210
+ }
1211
+ }
1212
+ }
1213
+ split->i_end = graph->n_nodes;
1214
+ sched->n_splits = i_split + 1;
1215
+ }
1216
+
1217
+ if (sched->debug) {
1218
+ ggml_backend_sched_print_assignments(sched, graph);
1219
+ }
1220
+
1221
+ // swap node_backend_ids and leaf _backend_ids with prevs
1222
+ {
1223
+ int * tmp = sched->node_backend_ids;
1224
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1225
+ sched->prev_node_backend_ids = tmp;
1226
+
1227
+ tmp = sched->leaf_backend_ids;
1228
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1229
+ sched->prev_leaf_backend_ids = tmp;
1230
+ }
1231
+
1232
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1233
+ if (sched->graph.size < graph_size) {
1234
+ sched->graph.size = graph_size;
1235
+ sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
1236
+ sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
1237
+ GGML_ASSERT(sched->graph.nodes != NULL);
1238
+ GGML_ASSERT(sched->graph.leafs != NULL);
1239
+ }
1240
+ sched->graph.n_nodes = 0;
1241
+ sched->graph.n_leafs = 0;
1242
+
1243
+ struct ggml_cgraph * graph_copy = &sched->graph;
1244
+
1245
+ for (int i = 0; i < sched->n_splits; i++) {
1246
+ struct ggml_backend_sched_split * split = &sched->splits[i];
1247
+ split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1248
+
1249
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1250
+ for (int j = 0; j < split->n_inputs; j++) {
1251
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1252
+
1253
+ struct ggml_tensor * input = split->inputs[j];
1254
+ const size_t input_id = hash_id(input);
1255
+ struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1256
+
1257
+ // add a dependency to the input source so that it is not freed before the copy is done
1258
+ struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1259
+ input_dep->src[0] = input;
1260
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1261
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1262
+
1263
+ // add a dependency to the input copy so that it is allocated at the start of the split
1264
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1265
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1266
+ }
1267
+
1268
+ for (int j = split->i_start; j < split->i_end; j++) {
1269
+ assert(graph_copy->size > graph_copy->n_nodes);
1270
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1271
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1272
+ }
1273
+ }
1274
+
1275
+ if (sched->n_copies > 1) {
1276
+ // add input copies as leafs so that they are allocated first
1277
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1278
+ struct ggml_tensor * input = sched->graph_inputs[i];
1279
+ size_t id = hash_id(input);
1280
+ int backend_id = tensor_backend_id(input);
1281
+ for (int c = 0; c < sched->n_copies; c++) {
1282
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1283
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1284
+ assert(graph_copy->size > graph_copy->n_leafs);
1285
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1286
+ }
1287
+ }
1288
+
1289
+ for (int i = 0; i < sched->n_splits; i++) {
1290
+ struct ggml_backend_sched_split * split = &sched->splits[i];
1291
+ int backend_id = split->backend_id;
1292
+ for (int j = 0; j < split->n_inputs; j++) {
1293
+ struct ggml_tensor * input = split->inputs[j];
1294
+ size_t id = hash_id(input);
1295
+ for (int c = 0; c < sched->n_copies; c++) {
1296
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1297
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1298
+ assert(graph_copy->size > graph_copy->n_leafs);
1299
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1300
+ }
1301
+ }
1302
+ }
1303
+ }
1304
+
1305
+ // add leafs from the original graph
1306
+ for (int i = 0; i < graph->n_leafs; i++) {
1307
+ struct ggml_tensor * leaf = graph->leafs[i];
1308
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1309
+ assert(graph_copy->size > graph_copy->n_leafs);
1310
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1311
+ }
1312
+ }
1313
+
1314
+ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+ bool backend_ids_changed = false;
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
+ backend_ids_changed = true;
+ break;
+ }
+ }
+ if (!backend_ids_changed) {
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
+ backend_ids_changed = true;
+ break;
+ }
+ }
+ }
+
+ // allocate graph
+ if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+ // the re-allocation may cause the split inputs to be moved to a different address
+ ggml_backend_sched_synchronize(sched);
+ #ifndef NDEBUG
+ GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+ #endif
+ ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+ if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+ GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+ struct ggml_backend_sched_split * splits = sched->splits;
+
+ for (int i = 0; i < sched->n_splits; i++) {
+ struct ggml_backend_sched_split * split = &splits[i];
+ int split_backend_id = split->backend_id;
+ ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+ // copy the input tensors to the split backend
+ for (int j = 0; j < split->n_inputs; j++) {
+ ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
+ struct ggml_tensor * input = split->inputs[j];
+ struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
+
+ if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ ggml_backend_synchronize(split_backend);
+ }
+ ggml_backend_tensor_copy(input, input_cpy);
+ } else {
+ // wait for the split backend to finish using the input before overwriting it
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ ggml_backend_synchronize(split_backend);
+ }
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+ ggml_backend_synchronize(input_backend);
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ ggml_backend_synchronize(split_backend);
+ }
+ ggml_backend_tensor_copy(input, input_cpy);
+ }
+ }
+ }
+
+ if (!sched->callback_eval) {
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return ec;
+ }
+ } else {
+ // similar to ggml_backend_compare_graph_backend
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+ struct ggml_tensor * t = split->graph.nodes[j0];
+
+ // check if the user needs data from this node
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+ int j1 = j0;
+
+ // determine the range [j0, j1] of nodes that can be computed together
+ while (!need && j1 < split->graph.n_nodes - 1) {
+ t = split->graph.nodes[++j1];
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+ }
+
+ struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return ec;
+ }
+
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+ ggml_backend_synchronize(split_backend);
+
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+ break;
+ }
+
+ j0 = j1;
+ }
+ }
+
+ // record the event of this copy
+ if (split->n_inputs > 0) {
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
+ }
+ }
+ }
+
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
+
+ return GGML_STATUS_SUCCESS;
+ }
+
+ ggml_backend_sched_t ggml_backend_sched_new(
+ ggml_backend_t * backends,
+ ggml_backend_buffer_type_t * bufts,
+ int n_backends,
+ size_t graph_size,
+ bool parallel) {
+ GGML_ASSERT(n_backends > 0);
+ GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+ GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
+
+ struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
+
+ const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+ sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
+ sched->n_backends = n_backends;
+ sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+ // initialize hash table
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
+ sched->hash_set = ggml_hash_set_new(graph_size);
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+ sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+
+ const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+ const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
+
+ sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
+
+ const int initial_splits_capacity = 16;
+ sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+ sched->splits_capacity = initial_splits_capacity;
+
+ for (int b = 0; b < n_backends; b++) {
+ sched->backends[b] = backends[b];
+ sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+ GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
+ if (sched->n_copies > 1) {
+ for (int c = 0; c < sched->n_copies; c++) {
+ sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
+ }
+ }
+ }
+
+ sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+
+ ggml_backend_sched_reset(sched);
+
+ return sched;
+ }
+
+ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+ if (sched == NULL) {
+ return;
+ }
+ for (int b = 0; b < sched->n_backends; b++) {
+ for (int c = 0; c < sched->n_copies; c++) {
+ ggml_backend_event_free(sched->events[b][c]);
+ }
+ }
+ ggml_gallocr_free(sched->galloc);
+ ggml_free(sched->ctx);
+ ggml_hash_set_free(&sched->hash_set);
+ free(sched->splits);
+ free(sched->hv_tensor_backend_ids);
+ free(sched->hv_tensor_copies);
+ free(sched->node_backend_ids);
+ free(sched->leaf_backend_ids);
+ free(sched->prev_node_backend_ids);
+ free(sched->prev_leaf_backend_ids);
+ free(sched->context_buffer);
+ free(sched->graph.nodes);
+ free(sched->graph.leafs);
+ free(sched);
+ }
+
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+ // reset state for the next run
+ if (!sched->is_reset) {
+ ggml_hash_set_reset(&sched->hash_set);
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+ sched->is_reset = true;
+ }
+ sched->is_alloc = false;
+ }
+
+ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+ GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+
+ ggml_backend_sched_split_graph(sched, measure_graph);
+
+ ggml_backend_sched_synchronize(sched);
+
+ if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+ return false;
+ }
+
+ ggml_backend_sched_reset(sched);
+
+ return true;
+ }
+
+ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+
+ ggml_backend_sched_split_graph(sched, graph);
+
+
+ if (!ggml_backend_sched_alloc_splits(sched)) {
+ return false;
+ }
+
+ sched->is_alloc = true;
+
+ return true;
+ }
+
+ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+ ggml_backend_sched_synchronize(sched);
+ return err;
+ }
+
+ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+ if (!sched->is_reset && !sched->is_alloc) {
+ ggml_backend_sched_reset(sched);
+ }
+
+ if (!sched->is_alloc) {
+ if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+ return GGML_STATUS_ALLOC_FAILED;
+ }
+ }
+
+ return ggml_backend_sched_compute_splits(sched);
+ }
+
+ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+ for (int i = 0; i < sched->n_backends; i++) {
+ ggml_backend_synchronize(sched->backends[i]);
+ }
+ }
+
+ void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+ sched->callback_eval = callback;
+ sched->callback_eval_user_data = user_data;
+ }
+
+ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+ return sched->n_splits;
+ }
+
+ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+ return sched->n_copies;
+ }
+
+ int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+ return sched->n_backends;
+ }
+
+ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+ GGML_ASSERT(i >= 0 && i < sched->n_backends);
+ return sched->backends[i];
+ }
+
+ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+ return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
+ }
+
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+ tensor_backend_id(node) = backend_index;
+ SET_CAUSE(node, "usr");
+ sched->is_reset = false;
+ }
+
+ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ int backend_index = tensor_backend_id(node);
+ if (backend_index == -1) {
+ return NULL;
+ }
+ return sched->backends[backend_index];
+ }
+
+ // utils
+
+ void ggml_backend_view_init(struct ggml_tensor * tensor) {
+ GGML_ASSERT(tensor->buffer == NULL);
+ GGML_ASSERT(tensor->view_src != NULL);
+ GGML_ASSERT(tensor->view_src->buffer != NULL);
+ GGML_ASSERT(tensor->view_src->data != NULL);
+
+ tensor->buffer = tensor->view_src->buffer;
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
+ }
+
+ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
+ GGML_ASSERT(tensor->buffer == NULL);
+ GGML_ASSERT(tensor->data == NULL);
+ GGML_ASSERT(tensor->view_src == NULL);
+ GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
+ GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
+ (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
+
+ tensor->buffer = buffer;
+ tensor->data = addr;
+ ggml_backend_buffer_init_tensor(buffer, tensor);
+ }
+
+ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+ struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
+
+ GGML_ASSERT(src != NULL);
+ GGML_ASSERT(src->data && "graph must be allocated");
+
+ size_t id = ggml_hash_insert(&hash_set, src);
+ if (id == GGML_HASHSET_ALREADY_EXISTS) {
+ return node_copies[ggml_hash_find(&hash_set, src)];
+ }
+
+ struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
+ if (src->view_src != NULL) {
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+ dst->view_offs = src->view_offs;
+ }
+ dst->op = src->op;
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
+ ggml_set_name(dst, src->name);
+
+ // copy src
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ struct ggml_tensor * s = src->src[i];
+ if (s == NULL) {
+ continue;
+ }
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+ }
+
+ node_copies[id] = dst;
+ return dst;
+ }
+
+ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+ size_t id = ggml_hash_find(hash_set, src);
+ if (node_init[id]) {
+ return;
+ }
+ node_init[id] = true;
+
+ struct ggml_tensor * dst = node_copies[id];
+ if (dst->view_src != NULL) {
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
+ ggml_backend_view_init(dst);
+ }
+ else {
+ ggml_backend_tensor_copy(src, dst);
+ }
+
+ // init src
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ struct ggml_tensor * s = src->src[i];
+ if (s == NULL) {
+ continue;
+ }
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
+ }
+ }
+
+ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
+ struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
+ struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
+
+ struct ggml_init_params params = {
+ /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
+ /* .mem_buffer = */ NULL,
+ /* .no_alloc = */ true
+ };
+
+ struct ggml_context * ctx_allocated = ggml_init(params);
+ struct ggml_context * ctx_unallocated = ggml_init(params);
+
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+ GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
+ ggml_hash_set_free(&hash_set);
+ free(node_copies);
+ free(node_init);
+ ggml_free(ctx_allocated);
+ ggml_free(ctx_unallocated);
+ return {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }
+
+ // dup nodes
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+ }
+
+ // allocate nodes
+ ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+ if (buffer == NULL) {
+ GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
+ ggml_hash_set_free(&hash_set);
+ free(node_copies);
+ free(node_init);
+ ggml_free(ctx_allocated);
+ ggml_free(ctx_unallocated);
+ return {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }
+
+ //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
+
+ // copy data and init views
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
+ }
+
+ // build graph copy
+ struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
+ graph_copy->nodes[i] = node_copy;
+ }
+ graph_copy->n_nodes = graph->n_nodes;
+
+ ggml_hash_set_free(&hash_set);
+ free(node_copies);
+ free(node_init);
+
+ return {
+ /* .buffer = */ buffer,
+ /* .ctx_allocated = */ ctx_allocated,
+ /* .ctx_unallocated = */ ctx_unallocated,
+ /* .graph = */ graph_copy,
+ };
+ }
+
+ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
+ ggml_backend_buffer_free(copy.buffer);
+ ggml_free(copy.ctx_allocated);
+ ggml_free(copy.ctx_unallocated);
+ }
+
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+ struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+ if (copy.buffer == NULL) {
+ return false;
+ }
+
+ struct ggml_cgraph * g1 = graph;
+ struct ggml_cgraph * g2 = copy.graph;
+
+ assert(g1->n_nodes == g2->n_nodes);
+
+ for (int i = 0; i < g1->n_nodes; i++) {
+ //printf("eval %d/%d\n", i, g1->n_nodes);
+ struct ggml_tensor * t1 = g1->nodes[i];
+ struct ggml_tensor * t2 = g2->nodes[i];
+
+ assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
+
+ struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
+ struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
+
+ ggml_backend_graph_compute(backend1, &g1v);
+ ggml_backend_graph_compute(backend2, &g2v);
+
+ if (ggml_is_view_op(t1->op)) {
+ continue;
+ }
+
+ // compare results, calculate rms etc
+ if (!callback(i, t1, t2, user_data)) {
+ break;
+ }
+ }
+
+ ggml_backend_graph_copy_free(copy);
+
+ return true;
+ }
+
+ // CPU backend - buffer
+
+ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+ uintptr_t data = (uintptr_t)buffer->context;
+
+ // align the buffer
+ if (data % TENSOR_ALIGNMENT != 0) {
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
+ }
+
+ return (void *)data;
+ }
+
+ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_aligned_free(buffer->context, buffer->size);
+ }
+
+ static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ memset((char *)tensor->data + offset, value, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ memcpy((char *)tensor->data + offset, data, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ if (ggml_backend_buffer_is_host(src->buffer)) {
+ memcpy(dst->data, src->data, ggml_nbytes(src));
+ return true;
+ }
+ return false;
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+ /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ // CPU backend buffer type
+
+ // this buffer type is defined here to make it available to all backends
+
+ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ void * data = ggml_aligned_malloc(size);
+
+ if (data == NULL) {
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+ return NULL;
+ }
+
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+ }
+
+ static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ return TENSOR_ALIGNMENT;
+
+ GGML_UNUSED(buft);
+ }
+
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU_Mapped";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+ }