whisper.rn 0.4.0-rc.8 → 0.4.0-rc.9

This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (53)
  1. package/android/src/main/CMakeLists.txt +2 -1
  2. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -12
  3. package/android/src/main/java/com/rnwhisper/RNWhisper.java +75 -34
  4. package/android/src/main/java/com/rnwhisper/WhisperContext.java +20 -3
  5. package/android/src/main/jni.cpp +29 -1
  6. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  7. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  8. package/cpp/ggml-aarch64.c +3209 -0
  9. package/cpp/ggml-aarch64.h +39 -0
  10. package/cpp/ggml-alloc.c +725 -517
  11. package/cpp/ggml-alloc.h +47 -65
  12. package/cpp/ggml-backend-impl.h +166 -55
  13. package/cpp/ggml-backend.cpp +2635 -0
  14. package/cpp/ggml-backend.h +202 -85
  15. package/cpp/ggml-common.h +1853 -0
  16. package/cpp/ggml-cpu-impl.h +614 -0
  17. package/cpp/ggml-impl.h +143 -180
  18. package/cpp/ggml-metal.h +13 -11
  19. package/cpp/ggml-metal.m +2955 -1632
  20. package/cpp/ggml-quants.c +9824 -3263
  21. package/cpp/ggml-quants.h +133 -248
  22. package/cpp/ggml-whisper.metallib +0 -0
  23. package/cpp/ggml.c +8482 -5142
  24. package/cpp/ggml.h +633 -349
  25. package/cpp/rn-whisper.cpp +91 -0
  26. package/cpp/rn-whisper.h +2 -0
  27. package/cpp/whisper.cpp +1427 -658
  28. package/cpp/whisper.h +84 -28
  29. package/ios/RNWhisper.mm +124 -37
  30. package/ios/RNWhisperAudioUtils.h +1 -0
  31. package/ios/RNWhisperAudioUtils.m +20 -13
  32. package/ios/RNWhisperContext.h +3 -2
  33. package/ios/RNWhisperContext.mm +39 -7
  34. package/jest/mock.js +9 -1
  35. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  36. package/lib/commonjs/index.js +48 -19
  37. package/lib/commonjs/index.js.map +1 -1
  38. package/lib/commonjs/version.json +1 -1
  39. package/lib/module/NativeRNWhisper.js.map +1 -1
  40. package/lib/module/index.js +48 -19
  41. package/lib/module/index.js.map +1 -1
  42. package/lib/module/version.json +1 -1
  43. package/lib/typescript/NativeRNWhisper.d.ts +6 -3
  44. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  45. package/lib/typescript/index.d.ts +25 -3
  46. package/lib/typescript/index.d.ts.map +1 -1
  47. package/package.json +6 -5
  48. package/src/NativeRNWhisper.ts +12 -3
  49. package/src/index.ts +63 -24
  50. package/src/version.json +1 -1
  51. package/whisper-rn.podspec +9 -2
  52. package/cpp/ggml-backend.c +0 -1718
  53. package/cpp/ggml-metal-whisper.metal +0 -5820
@@ -0,0 +1,2635 @@
1
+ // Note: porting this file to C++ is a work in progress
2
+
3
+ #ifdef _WIN32
4
+ #define WIN32_LEAN_AND_MEAN
5
+ #ifndef NOMINMAX
6
+ # define NOMINMAX
7
+ #endif
8
+ #include <windows.h>
9
+ #endif
10
+
11
+ #include "ggml-backend-impl.h"
12
+ #include "ggml-alloc.h"
13
+ #include "ggml-impl.h"
14
+
15
+ #include <assert.h>
16
+ #include <limits.h>
17
+ #include <stdarg.h>
18
+ #include <stdio.h>
19
+ #include <stdlib.h>
20
+ #include <string.h>
21
+ #include <string>
22
+ #include <vector>
23
+
24
+ #ifdef __APPLE__
25
+ #include <sys/types.h>
26
+ #include <sys/sysctl.h>
27
+ #endif
28
+
29
+
30
+ // backend buffer type
31
+
32
+ const char * wsp_ggml_backend_buft_name(wsp_ggml_backend_buffer_type_t buft) {
33
+ return buft->iface.get_name(buft);
34
+ }
35
+
36
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
37
+ return buft->iface.alloc_buffer(buft, size);
38
+ }
39
+
40
+ size_t wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
41
+ return buft->iface.get_alignment(buft);
42
+ }
43
+
44
+ size_t wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_type_t buft) {
45
+ // get_max_size is optional, defaults to SIZE_MAX
46
+ if (buft->iface.get_max_size) {
47
+ return buft->iface.get_max_size(buft);
48
+ }
49
+ return SIZE_MAX;
50
+ }
51
+
52
+ size_t wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_type_t buft, struct wsp_ggml_tensor * tensor) {
53
+ // get_alloc_size is optional, defaults to wsp_ggml_nbytes
54
+ if (buft->iface.get_alloc_size) {
55
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
56
+ assert(size >= wsp_ggml_nbytes(tensor));
57
+ return size;
58
+ }
59
+ return wsp_ggml_nbytes(tensor);
60
+ }
61
+
62
+ bool wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_type_t buft) {
63
+ if (buft->iface.is_host) {
64
+ return buft->iface.is_host(buft);
65
+ }
66
+ return false;
67
+ }
68
+
69
+ wsp_ggml_backend_dev_t wsp_ggml_backend_buft_get_device(wsp_ggml_backend_buffer_type_t buft) {
70
+ return buft->device;
71
+ }
72
+
73
+ // backend buffer
74
+
75
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
76
+ wsp_ggml_backend_buffer_type_t buft,
77
+ struct wsp_ggml_backend_buffer_i iface,
78
+ void * context,
79
+ size_t size) {
80
+ wsp_ggml_backend_buffer_t buffer = new wsp_ggml_backend_buffer {
81
+ /* .interface = */ iface,
82
+ /* .buft = */ buft,
83
+ /* .context = */ context,
84
+ /* .size = */ size,
85
+ /* .usage = */ WSP_GGML_BACKEND_BUFFER_USAGE_ANY
86
+ };
87
+
88
+ return buffer;
89
+ }
90
+
91
+ const char * wsp_ggml_backend_buffer_name(wsp_ggml_backend_buffer_t buffer) {
92
+ return buffer->iface.get_name(buffer);
93
+ }
94
+
95
+ void wsp_ggml_backend_buffer_free(wsp_ggml_backend_buffer_t buffer) {
96
+ if (buffer == NULL) {
97
+ return;
98
+ }
99
+
100
+ if (buffer->iface.free_buffer != NULL) {
101
+ buffer->iface.free_buffer(buffer);
102
+ }
103
+ delete buffer;
104
+ }
105
+
106
+ size_t wsp_ggml_backend_buffer_get_size(wsp_ggml_backend_buffer_t buffer) {
107
+ return buffer->size;
108
+ }
109
+
110
+ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
111
+ void * base = buffer->iface.get_base(buffer);
112
+
113
+ WSP_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
114
+
115
+ return base;
116
+ }
117
+
118
+ void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
119
+ // init_tensor is optional
120
+ if (buffer->iface.init_tensor) {
121
+ buffer->iface.init_tensor(buffer, tensor);
122
+ }
123
+ }
124
+
125
+ size_t wsp_ggml_backend_buffer_get_alignment(wsp_ggml_backend_buffer_t buffer) {
126
+ return wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_buffer_get_type(buffer));
127
+ }
128
+
129
+ size_t wsp_ggml_backend_buffer_get_max_size(wsp_ggml_backend_buffer_t buffer) {
130
+ return wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_buffer_get_type(buffer));
131
+ }
132
+
133
+ size_t wsp_ggml_backend_buffer_get_alloc_size(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
134
+ return wsp_ggml_backend_buft_get_alloc_size(wsp_ggml_backend_buffer_get_type(buffer), tensor);
135
+ }
136
+
137
+ void wsp_ggml_backend_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
138
+ buffer->iface.clear(buffer, value);
139
+ }
140
+
141
+ bool wsp_ggml_backend_buffer_is_host(wsp_ggml_backend_buffer_t buffer) {
142
+ return wsp_ggml_backend_buft_is_host(wsp_ggml_backend_buffer_get_type(buffer));
143
+ }
144
+
145
+ void wsp_ggml_backend_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
146
+ buffer->usage = usage;
147
+
148
+ // FIXME: add a generic callback to the buffer interface
149
+ if (wsp_ggml_backend_buffer_is_multi_buffer(buffer)) {
150
+ wsp_ggml_backend_multi_buffer_set_usage(buffer, usage);
151
+ }
152
+ }
153
+
154
+ enum wsp_ggml_backend_buffer_usage wsp_ggml_backend_buffer_get_usage(wsp_ggml_backend_buffer_t buffer) {
155
+ return buffer->usage;
156
+ }
157
+
158
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_buffer_get_type(wsp_ggml_backend_buffer_t buffer) {
159
+ return buffer->buft;
160
+ }
161
+
162
+ void wsp_ggml_backend_buffer_reset(wsp_ggml_backend_buffer_t buffer) {
163
+ if (buffer->iface.reset) {
164
+ buffer->iface.reset(buffer);
165
+ }
166
+ }
167
+
168
+ bool wsp_ggml_backend_buffer_copy_tensor(const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
169
+ wsp_ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
170
+ if (dst_buf->iface.cpy_tensor) {
171
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
172
+ }
173
+ return false;
174
+ }
175
+
176
+ // backend
177
+
178
+ wsp_ggml_guid_t wsp_ggml_backend_guid(wsp_ggml_backend_t backend) {
179
+ if (backend == NULL) {
180
+ return NULL;
181
+ }
182
+ return backend->guid;
183
+ }
184
+
185
+ const char * wsp_ggml_backend_name(wsp_ggml_backend_t backend) {
186
+ if (backend == NULL) {
187
+ return "NULL";
188
+ }
189
+ return backend->iface.get_name(backend);
190
+ }
191
+
192
+ void wsp_ggml_backend_free(wsp_ggml_backend_t backend) {
193
+ if (backend == NULL) {
194
+ return;
195
+ }
196
+
197
+ backend->iface.free(backend);
198
+ }
199
+
200
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_get_default_buffer_type(wsp_ggml_backend_t backend) {
201
+ return backend->iface.get_default_buffer_type(backend);
202
+ }
203
+
204
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_buffer(wsp_ggml_backend_t backend, size_t size) {
205
+ return wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_get_default_buffer_type(backend), size);
206
+ }
207
+
208
+ size_t wsp_ggml_backend_get_alignment(wsp_ggml_backend_t backend) {
209
+ return wsp_ggml_backend_buft_get_alignment(wsp_ggml_backend_get_default_buffer_type(backend));
210
+ }
211
+
212
+ size_t wsp_ggml_backend_get_max_size(wsp_ggml_backend_t backend) {
213
+ return wsp_ggml_backend_buft_get_max_size(wsp_ggml_backend_get_default_buffer_type(backend));
214
+ }
215
+
216
+ void wsp_ggml_backend_tensor_set_async(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
217
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
218
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
219
+
220
+ if (backend->iface.set_tensor_async == NULL) {
221
+ wsp_ggml_backend_tensor_set(tensor, data, offset, size);
222
+ } else {
223
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
224
+ }
225
+ }
226
+
227
+ void wsp_ggml_backend_tensor_get_async(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
228
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
229
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
230
+
231
+ if (backend->iface.get_tensor_async == NULL) {
232
+ wsp_ggml_backend_tensor_get(tensor, data, offset, size);
233
+ } else {
234
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
235
+ }
236
+ }
237
+
238
+ void wsp_ggml_backend_tensor_set(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
239
+ wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
240
+
241
+ WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
242
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
243
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
244
+
245
+ if (!size) {
246
+ return;
247
+ }
248
+
249
+ buf->iface.set_tensor(buf, tensor, data, offset, size);
250
+ }
251
+
252
+ void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
253
+ wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
254
+
255
+ WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
256
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
258
+
259
+ if (!size) {
260
+ return;
261
+ }
262
+
263
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
264
+ }
265
+
266
+ WSP_GGML_API void wsp_ggml_backend_tensor_memset(struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
267
+ wsp_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
268
+
269
+ WSP_GGML_ASSERT(buf != NULL && "tensor buffer not set");
270
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
271
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
272
+
273
+ if (!size) {
274
+ return;
275
+ }
276
+
277
+ WSP_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
278
+
279
+ buf->iface.memset_tensor(buf, tensor, value, offset, size);
280
+ }
281
+
282
+ void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
283
+ if (backend->iface.synchronize == NULL) {
284
+ return;
285
+ }
286
+
287
+ backend->iface.synchronize(backend);
288
+ }
289
+
290
+ wsp_ggml_backend_graph_plan_t wsp_ggml_backend_graph_plan_create(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
291
+ WSP_GGML_ASSERT(backend->iface.graph_plan_create != NULL);
292
+
293
+ return backend->iface.graph_plan_create(backend, cgraph);
294
+ }
295
+
296
+ void wsp_ggml_backend_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
297
+ WSP_GGML_ASSERT(backend->iface.graph_plan_free != NULL);
298
+
299
+ backend->iface.graph_plan_free(backend, plan);
300
+ }
301
+
302
+ enum wsp_ggml_status wsp_ggml_backend_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
303
+ WSP_GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
304
+
305
+ return backend->iface.graph_plan_compute(backend, plan);
306
+ }
307
+
308
+ enum wsp_ggml_status wsp_ggml_backend_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
309
+ enum wsp_ggml_status err = wsp_ggml_backend_graph_compute_async(backend, cgraph);
310
+ wsp_ggml_backend_synchronize(backend);
311
+ return err;
312
+ }
313
+
314
+ enum wsp_ggml_status wsp_ggml_backend_graph_compute_async(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
315
+ return backend->iface.graph_compute(backend, cgraph);
316
+ }
317
+
318
+ bool wsp_ggml_backend_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
319
+ // helper to ease transition to device interface
320
+ if (backend->device) {
321
+ return wsp_ggml_backend_dev_supports_op(backend->device, op);
322
+ }
323
+
324
+ return backend->iface.supports_op(backend, op);
325
+ }
326
+
327
+ bool wsp_ggml_backend_supports_buft(wsp_ggml_backend_t backend, wsp_ggml_backend_buffer_type_t buft) {
328
+ // helper to ease transition to device interface
329
+ if (backend->device) {
330
+ return wsp_ggml_backend_dev_supports_buft(backend->device, buft);
331
+ }
332
+ return backend->iface.supports_buft(backend, buft);
333
+ }
334
+
335
+ bool wsp_ggml_backend_offload_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
336
+ // helper to ease transition to device interface
337
+ if (backend->device) {
338
+ return wsp_ggml_backend_dev_offload_op(backend->device, op);
339
+ }
340
+
341
+ if (backend->iface.offload_op != NULL) {
342
+ return backend->iface.offload_op(backend, op);
343
+ }
344
+ return false;
345
+ }
346
+
347
+ wsp_ggml_backend_dev_t wsp_ggml_backend_get_device(wsp_ggml_backend_t backend) {
348
+ return backend->device;
349
+ }
350
+
351
+ // backend copy
352
+
353
+ static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
354
+ if (a->type != b->type) {
355
+ return false;
356
+ }
357
+ for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
358
+ if (a->ne[i] != b->ne[i]) {
359
+ return false;
360
+ }
361
+ if (a->nb[i] != b->nb[i]) {
362
+ return false;
363
+ }
364
+ }
365
+ return true;
366
+ }
367
+
368
+ void wsp_ggml_backend_tensor_copy(struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
369
+ WSP_GGML_ASSERT(wsp_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
370
+
371
+ if (src == dst) {
372
+ return;
373
+ }
374
+
375
+ if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
376
+ wsp_ggml_backend_tensor_set(dst, src->data, 0, wsp_ggml_nbytes(src));
377
+ } else if (wsp_ggml_backend_buffer_is_host(dst->buffer)) {
378
+ wsp_ggml_backend_tensor_get(src, dst->data, 0, wsp_ggml_nbytes(src));
379
+ } else if (!wsp_ggml_backend_buffer_copy_tensor(src, dst)) {
380
+ #ifndef NDEBUG
381
+ WSP_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, wsp_ggml_backend_buffer_name(src->buffer), wsp_ggml_backend_buffer_name(dst->buffer));
382
+ #endif
383
+ size_t nbytes = wsp_ggml_nbytes(src);
384
+ void * data = malloc(nbytes);
385
+ wsp_ggml_backend_tensor_get(src, data, 0, nbytes);
386
+ wsp_ggml_backend_tensor_set(dst, data, 0, nbytes);
387
+ free(data);
388
+ }
389
+ }
390
+
391
+ void wsp_ggml_backend_tensor_copy_async(wsp_ggml_backend_t backend_src, wsp_ggml_backend_t backend_dst, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
392
+ WSP_GGML_ASSERT(wsp_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
393
+
394
+ if (src == dst) {
395
+ return;
396
+ }
397
+
398
+ if (backend_dst->iface.cpy_tensor_async != NULL) {
399
+ if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
400
+ return;
401
+ }
402
+ }
403
+
404
+ // an async copy would normally happen after all the queued operations on both backends are completed
405
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
406
+ wsp_ggml_backend_synchronize(backend_src);
407
+ wsp_ggml_backend_synchronize(backend_dst);
408
+ wsp_ggml_backend_tensor_copy(src, dst);
409
+ }
410
+
411
+ // events
412
+
413
+ wsp_ggml_backend_event_t wsp_ggml_backend_event_new(wsp_ggml_backend_dev_t device) {
414
+ // null device is allowed for the transition period to the device interface
415
+ if (device == NULL || device->iface.event_new == NULL) {
416
+ return NULL;
417
+ }
418
+ return device->iface.event_new(device);
419
+ }
420
+
421
+ void wsp_ggml_backend_event_free(wsp_ggml_backend_event_t event) {
422
+ if (event == NULL) {
423
+ return;
424
+ }
425
+ event->device->iface.event_free(event->device, event);
426
+ }
427
+
428
+ void wsp_ggml_backend_event_record(wsp_ggml_backend_event_t event, wsp_ggml_backend_t backend) {
429
+ WSP_GGML_ASSERT(backend->iface.event_record != NULL);
430
+
431
+ backend->iface.event_record(backend, event);
432
+ }
433
+
434
+ void wsp_ggml_backend_event_synchronize(wsp_ggml_backend_event_t event) {
435
+ WSP_GGML_ASSERT(event->device->iface.event_synchronize);
436
+
437
+ event->device->iface.event_synchronize(event->device, event);
438
+ }
439
+
440
+ void wsp_ggml_backend_event_wait(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event) {
441
+ WSP_GGML_ASSERT(backend->iface.event_wait != NULL);
442
+
443
+ backend->iface.event_wait(backend, event);
444
+ }
445
+
446
+ // Backend device
447
+
448
+ const char * wsp_ggml_backend_dev_name(wsp_ggml_backend_dev_t device) {
449
+ return device->iface.get_name(device);
450
+ }
451
+
452
+ const char * wsp_ggml_backend_dev_description(wsp_ggml_backend_dev_t device) {
453
+ return device->iface.get_description(device);
454
+ }
455
+
456
+ void wsp_ggml_backend_dev_memory(wsp_ggml_backend_dev_t device, size_t * free, size_t * total) {
457
+ device->iface.get_memory(device, free, total);
458
+ }
459
+
460
+ enum wsp_ggml_backend_dev_type wsp_ggml_backend_dev_type(wsp_ggml_backend_dev_t device) {
461
+ return device->iface.get_type(device);
462
+ }
463
+
464
+ void wsp_ggml_backend_dev_get_props(wsp_ggml_backend_dev_t device, struct wsp_ggml_backend_dev_props * props) {
465
+ memset(props, 0, sizeof(*props));
466
+ device->iface.get_props(device, props);
467
+ }
468
+
469
+ wsp_ggml_backend_reg_t wsp_ggml_backend_dev_backend_reg(wsp_ggml_backend_dev_t device) {
470
+ return device->reg;
471
+ }
472
+
473
+ wsp_ggml_backend_t wsp_ggml_backend_dev_init(wsp_ggml_backend_dev_t device, const char * params) {
474
+ return device->iface.init_backend(device, params);
475
+ }
476
+
477
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_buffer_type(wsp_ggml_backend_dev_t device) {
478
+ return device->iface.get_buffer_type(device);
479
+ }
480
+
481
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_dev_host_buffer_type(wsp_ggml_backend_dev_t device) {
482
+ if (device->iface.get_host_buffer_type == NULL) {
483
+ return NULL;
484
+ }
485
+
486
+ return device->iface.get_host_buffer_type(device);
487
+ }
488
+
489
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_dev_buffer_from_host_ptr(wsp_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
490
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
491
+ }
492
+
493
+ bool wsp_ggml_backend_dev_supports_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
494
+ return device->iface.supports_op(device, op);
495
+ }
496
+
497
+ bool wsp_ggml_backend_dev_supports_buft(wsp_ggml_backend_dev_t device, wsp_ggml_backend_buffer_type_t buft) {
498
+ return device->iface.supports_buft(device, buft);
499
+ }
500
+
501
+ bool wsp_ggml_backend_dev_offload_op(wsp_ggml_backend_dev_t device, const struct wsp_ggml_tensor * op) {
502
+ if (device->iface.offload_op != NULL) {
503
+ return device->iface.offload_op(device, op);
504
+ }
505
+
506
+ return false;
507
+ }
508
+
509
+ // Backend (reg)
510
+
511
+ const char * wsp_ggml_backend_reg_name(wsp_ggml_backend_reg_t reg) {
512
+ return reg->iface.get_name(reg);
513
+ }
514
+
515
+ size_t wsp_ggml_backend_reg_dev_count(wsp_ggml_backend_reg_t reg) {
516
+ return reg->iface.get_device_count(reg);
517
+ }
518
+
519
+ wsp_ggml_backend_dev_t wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_reg_t reg, size_t index) {
520
+ return reg->iface.get_device(reg, index);
521
+ }
522
+
523
+ void * wsp_ggml_backend_reg_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
524
+ if (!reg->iface.get_proc_address) {
525
+ return NULL;
526
+ }
527
+ return reg->iface.get_proc_address(reg, name);
528
+ }
529
+
530
+ // Backend registry
531
+
532
+ #ifdef WSP_GGML_USE_CUDA
533
+ #include "ggml-cuda.h"
534
+ #endif
535
+
536
+ #ifdef WSP_GGML_USE_METAL
537
+ #include "ggml-metal.h"
538
+ #endif
539
+
540
+ #ifdef WSP_GGML_USE_SYCL
541
+ #include "ggml-sycl.h"
542
+ #endif
543
+
544
+ #ifdef WSP_GGML_USE_VULKAN
545
+ #include "ggml-vulkan.h"
546
+ #endif
547
+
548
+ #ifdef WSP_GGML_USE_BLAS
549
+ #include "ggml-blas.h"
550
+ #endif
551
+
552
+ #ifdef WSP_GGML_USE_RPC
553
+ #include "ggml-rpc.h"
554
+ #endif
555
+
556
+ #ifndef __AMX_INT8__
557
+ #undef WSP_GGML_USE_AMX
558
+ #endif
559
+
560
+ #ifdef WSP_GGML_USE_AMX
561
+ # include "ggml-amx.h"
562
+ #endif
563
+
564
+ #ifdef WSP_GGML_USE_CANN
565
+ #include "ggml-cann.h"
566
+ #endif
567
+
568
+ struct wsp_ggml_backend_registry {
569
+ std::vector<wsp_ggml_backend_reg_t> backends;
570
+ std::vector<wsp_ggml_backend_dev_t> devices;
571
+
572
+ wsp_ggml_backend_registry() {
573
+ #ifdef WSP_GGML_USE_CUDA
574
+ register_backend(wsp_ggml_backend_cuda_reg());
575
+ #endif
576
+ #ifdef WSP_GGML_USE_METAL
577
+ #include <TargetConditionals.h>
578
+ #if !TARGET_OS_SIMULATOR
579
+ register_backend(wsp_ggml_backend_metal_reg());
580
+ #endif
581
+ #endif
582
+ #ifdef WSP_GGML_USE_SYCL
583
+ register_backend(wsp_ggml_backend_sycl_reg());
584
+ #endif
585
+ #ifdef WSP_GGML_USE_VULKAN
586
+ register_backend(wsp_ggml_backend_vk_reg());
587
+ #endif
588
+ #ifdef WSP_GGML_USE_BLAS
589
+ register_backend(wsp_ggml_backend_blas_reg());
590
+ #endif
591
+ #ifdef WSP_GGML_USE_RPC
592
+ register_backend(wsp_ggml_backend_rpc_reg());
593
+ #endif
594
+ #ifdef WSP_GGML_USE_AMX
595
+ register_backend(wsp_ggml_backend_amx_reg());
596
+ #endif
597
+ #ifdef WSP_GGML_USE_CANN
598
+ register_backend(wsp_ggml_backend_cann_reg());
599
+ #endif
600
+
601
+ // TODO: kompute
602
+
603
+ register_backend(wsp_ggml_backend_cpu_reg());
604
+ }
605
+
606
+ void register_backend(wsp_ggml_backend_reg_t reg) {
607
+ #ifndef NDEBUG
608
+ WSP_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
609
+ __func__, wsp_ggml_backend_reg_name(reg), wsp_ggml_backend_reg_dev_count(reg));
610
+ #endif
611
+ backends.push_back(reg);
612
+ for (size_t i = 0; i < wsp_ggml_backend_reg_dev_count(reg); i++) {
613
+ register_device(wsp_ggml_backend_reg_dev_get(reg, i));
614
+ }
615
+ }
616
+
617
+ void register_device(wsp_ggml_backend_dev_t device) {
618
+ #ifndef NDEBUG
619
+ WSP_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, wsp_ggml_backend_dev_name(device), wsp_ggml_backend_dev_description(device));
620
+ #endif
621
+ devices.push_back(device);
622
+ }
623
+ };
624
+
625
+ static wsp_ggml_backend_registry & get_reg() {
626
+ static wsp_ggml_backend_registry reg;
627
+ return reg;
628
+ }
629
+
630
+ // Internal API
631
+ void wsp_ggml_backend_register(wsp_ggml_backend_reg_t reg) {
632
+ get_reg().register_backend(reg);
633
+ }
634
+
635
+ void wsp_ggml_backend_device_register(wsp_ggml_backend_dev_t device) {
636
+ get_reg().register_device(device);
637
+ }
638
+
639
+ // Backend (reg) enumeration
640
+ size_t wsp_ggml_backend_reg_count() {
641
+ return get_reg().backends.size();
642
+ }
643
+
644
+ wsp_ggml_backend_reg_t wsp_ggml_backend_reg_get(size_t index) {
645
+ WSP_GGML_ASSERT(index < wsp_ggml_backend_reg_count());
646
+ return get_reg().backends[index];
647
+ }
648
+
649
+ wsp_ggml_backend_reg_t wsp_ggml_backend_reg_by_name(const char * name) {
650
+ for (size_t i = 0; i < wsp_ggml_backend_reg_count(); i++) {
651
+ wsp_ggml_backend_reg_t reg = wsp_ggml_backend_reg_get(i);
652
+ if (strcmp(wsp_ggml_backend_reg_name(reg), name) == 0) {
653
+ return reg;
654
+ }
655
+ }
656
+ return NULL;
657
+ }
658
+
659
+ // Device enumeration
660
+ size_t wsp_ggml_backend_dev_count() {
661
+ return get_reg().devices.size();
662
+ }
663
+
664
+ wsp_ggml_backend_dev_t wsp_ggml_backend_dev_get(size_t index) {
665
+ WSP_GGML_ASSERT(index < wsp_ggml_backend_dev_count());
666
+ return get_reg().devices[index];
667
+ }
668
+
669
+ wsp_ggml_backend_dev_t wsp_ggml_backend_dev_by_name(const char * name) {
670
+ for (size_t i = 0; i < wsp_ggml_backend_dev_count(); i++) {
671
+ wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
672
+ if (strcmp(wsp_ggml_backend_dev_name(dev), name) == 0) {
673
+ return dev;
674
+ }
675
+ }
676
+ return NULL;
677
+ }
678
+
679
+ wsp_ggml_backend_dev_t wsp_ggml_backend_dev_by_type(enum wsp_ggml_backend_dev_type type) {
680
+ for (size_t i = 0; i < wsp_ggml_backend_dev_count(); i++) {
681
+ wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
682
+ if (wsp_ggml_backend_dev_type(dev) == type) {
683
+ return dev;
684
+ }
685
+ }
686
+ return NULL;
687
+ }
688
+
689
+ // Convenience functions
690
+ wsp_ggml_backend_t wsp_ggml_backend_init_by_name(const char * name, const char * params) {
691
+ wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_by_name(name);
692
+ if (!dev) {
693
+ return NULL;
694
+ }
695
+ return wsp_ggml_backend_dev_init(dev, params);
696
+ }
697
+
698
+ wsp_ggml_backend_t wsp_ggml_backend_init_by_type(enum wsp_ggml_backend_dev_type type, const char * params) {
699
+ wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_by_type(type);
700
+ if (!dev) {
701
+ return NULL;
702
+ }
703
+ return wsp_ggml_backend_dev_init(dev, params);
704
+ }
705
+
706
+ wsp_ggml_backend_t wsp_ggml_backend_init_best(void) {
707
+ wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
708
+ if (!dev) {
709
+ dev = wsp_ggml_backend_dev_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
710
+ }
711
+ if (!dev) {
712
+ return NULL;
713
+ }
714
+ return wsp_ggml_backend_dev_init(dev, NULL);
715
+ }
716
+
717
+ // backend CPU
718
+
719
+ static const char * wsp_ggml_backend_cpu_buffer_get_name(wsp_ggml_backend_buffer_t buffer) {
720
+ return "CPU";
721
+
722
+ WSP_GGML_UNUSED(buffer);
723
+ }
724
+
725
+ static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
726
+ uintptr_t data = (uintptr_t)buffer->context;
727
+
728
+ // align the buffer
729
+ if (data % TENSOR_ALIGNMENT != 0) {
730
+ data = WSP_GGML_PAD(data, TENSOR_ALIGNMENT);
731
+ }
732
+
733
+ return (void *)data;
734
+ }
735
+
736
+ static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
737
+ wsp_ggml_aligned_free(buffer->context, buffer->size);
738
+ }
739
+
740
+ static void wsp_ggml_backend_cpu_buffer_memset_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
741
+ memset((char *)tensor->data + offset, value, size);
742
+
743
+ WSP_GGML_UNUSED(buffer);
744
+ }
745
+
746
+ static void wsp_ggml_backend_cpu_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
747
+ memcpy((char *)tensor->data + offset, data, size);
748
+
749
+ WSP_GGML_UNUSED(buffer);
750
+ }
751
+
752
+ static void wsp_ggml_backend_cpu_buffer_get_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
753
+ memcpy(data, (const char *)tensor->data + offset, size);
754
+
755
+ WSP_GGML_UNUSED(buffer);
756
+ }
757
+
758
+ static bool wsp_ggml_backend_cpu_buffer_cpy_tensor(wsp_ggml_backend_buffer_t buffer, const struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
759
+ if (wsp_ggml_backend_buffer_is_host(src->buffer)) {
760
+ memcpy(dst->data, src->data, wsp_ggml_nbytes(src));
761
+ return true;
762
+ }
763
+ return false;
764
+
765
+ WSP_GGML_UNUSED(buffer);
766
+ }
767
+
768
+ static void wsp_ggml_backend_cpu_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
769
+ memset(buffer->context, value, buffer->size);
770
+ }
771
+
772
+ static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_cpu_buffer_i = {
773
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_get_name,
774
+ /* .free_buffer = */ wsp_ggml_backend_cpu_buffer_free_buffer,
775
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
776
+ /* .init_tensor = */ NULL, // no initialization required
777
+ /* .memset_tensor = */ wsp_ggml_backend_cpu_buffer_memset_tensor,
778
+ /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
779
+ /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
780
+ /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
781
+ /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
782
+ /* .reset = */ NULL,
783
+ };
784
+
785
+ static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_cpu_buffer_from_ptr_i = {
786
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_get_name,
787
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
788
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
789
+ /* .init_tensor = */ NULL, // no initialization required
790
+ /* .memset_tensor = */ wsp_ggml_backend_cpu_buffer_memset_tensor,
791
+ /* .set_tensor = */ wsp_ggml_backend_cpu_buffer_set_tensor,
792
+ /* .get_tensor = */ wsp_ggml_backend_cpu_buffer_get_tensor,
793
+ /* .cpy_tensor = */ wsp_ggml_backend_cpu_buffer_cpy_tensor,
794
+ /* .clear = */ wsp_ggml_backend_cpu_buffer_clear,
795
+ /* .reset = */ NULL,
796
+ };
797
+
798
+ static const char * wsp_ggml_backend_cpu_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
799
+ return "CPU";
800
+
801
+ WSP_GGML_UNUSED(buft);
802
+ }
803
+
804
+ static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
805
+ auto alloc_size = size;
806
+ if (alloc_size == 0) {
807
+ alloc_size = 1;
808
+ }
809
+
810
+ void * data = wsp_ggml_aligned_malloc(alloc_size);
811
+
812
+ if (data == NULL) {
813
+ WSP_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
814
+ return NULL;
815
+ }
816
+
817
+ return wsp_ggml_backend_buffer_init(buft, wsp_ggml_backend_cpu_buffer_i, data, alloc_size);
818
+ }
819
+
820
+ static size_t wsp_ggml_backend_cpu_buffer_type_get_alignment(wsp_ggml_backend_buffer_type_t buft) {
821
+ return TENSOR_ALIGNMENT;
822
+
823
+ WSP_GGML_UNUSED(buft);
824
+ }
825
+
826
+ static bool wsp_ggml_backend_cpu_buffer_type_is_host(wsp_ggml_backend_buffer_type_t buft) {
827
+ return true;
828
+
829
+ WSP_GGML_UNUSED(buft);
830
+ }
831
+
832
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_buffer_type(void) {
833
+ static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type = {
834
+ /* .iface = */ {
835
+ /* .get_name = */ wsp_ggml_backend_cpu_buffer_type_get_name,
836
+ /* .alloc_buffer = */ wsp_ggml_backend_cpu_buffer_type_alloc_buffer,
837
+ /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
838
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
839
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
840
+ /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
841
+ },
842
+ /* .device = */ wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
843
+ /* .context = */ NULL,
844
+ };
845
+
846
+ return &wsp_ggml_backend_cpu_buffer_type;
847
+ }
848
+
849
+ #ifdef WSP_GGML_USE_CPU_HBM
850
+
851
+ // buffer type HBM
852
+
853
+ #include <hbwmalloc.h>
854
+
855
+ static const char * wsp_ggml_backend_cpu_hbm_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
856
+ return "CPU_HBM";
857
+
858
+ WSP_GGML_UNUSED(buft);
859
+ }
860
+
861
+ static const char * wsp_ggml_backend_cpu_hbm_buffer_get_name(wsp_ggml_backend_buffer_t buf) {
862
+ return "CPU_HBM";
863
+
864
+ WSP_GGML_UNUSED(buf);
865
+ }
866
+
867
+ static void wsp_ggml_backend_cpu_hbm_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
868
+ hbw_free(buffer->context);
869
+ }
870
+
871
+ static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
872
+ //void * ptr = hbw_malloc(size);
873
+ void * ptr;
874
+ int result = hbw_posix_memalign(&ptr, wsp_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
875
+ if (result != 0) {
876
+ WSP_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
877
+ return NULL;
878
+ }
879
+
880
+ wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_cpu_buffer_from_ptr(ptr, size);
881
+ buffer->buft = buft;
882
+ buffer->iface.get_name = wsp_ggml_backend_cpu_hbm_buffer_get_name;
883
+ buffer->iface.free_buffer = wsp_ggml_backend_cpu_hbm_buffer_free_buffer;
884
+
885
+ return buffer;
886
+ }
887
+
888
+ wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_hbm_buffer_type(void) {
889
+ static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type_hbm = {
890
+ /* .iface = */ {
891
+ /* .get_name = */ wsp_ggml_backend_cpu_hbm_buffer_type_get_name,
892
+ /* .alloc_buffer = */ wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
893
+ /* .get_alignment = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
894
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
895
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
896
+ /* .is_host = */ wsp_ggml_backend_cpu_buffer_type_is_host,
897
+ },
898
+ /* .context = */ NULL,
899
+ };
900
+
901
+ return &wsp_ggml_backend_cpu_buffer_type_hbm;
902
+ }
903
+ #endif
904
+
905
+ struct wsp_ggml_backend_cpu_context {
906
+ int n_threads;
907
+ wsp_ggml_threadpool_t threadpool;
908
+
909
+ uint8_t * work_data;
910
+ size_t work_size;
911
+
912
+ wsp_ggml_abort_callback abort_callback;
913
+ void * abort_callback_data;
914
+ };
915
+
916
+ static const char * wsp_ggml_backend_cpu_get_name(wsp_ggml_backend_t backend) {
917
+ return "CPU";
918
+
919
+ WSP_GGML_UNUSED(backend);
920
+ }
921
+
922
+ static void wsp_ggml_backend_cpu_free(wsp_ggml_backend_t backend) {
923
+ struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
924
+ delete[] cpu_ctx->work_data;
925
+ delete cpu_ctx;
926
+ delete backend;
927
+ }
928
+
929
+ static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_get_default_buffer_type(wsp_ggml_backend_t backend) {
930
+ return wsp_ggml_backend_cpu_buffer_type();
931
+
932
+ WSP_GGML_UNUSED(backend);
933
+ }
934
+
935
+ struct wsp_ggml_backend_plan_cpu {
936
+ struct wsp_ggml_cplan cplan;
937
+ struct wsp_ggml_cgraph cgraph;
938
+ };
939
+
940
+ static wsp_ggml_backend_graph_plan_t wsp_ggml_backend_cpu_graph_plan_create(wsp_ggml_backend_t backend, const struct wsp_ggml_cgraph * cgraph) {
941
+ struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
942
+
943
+ struct wsp_ggml_backend_plan_cpu * cpu_plan = new wsp_ggml_backend_plan_cpu;
944
+
945
+ cpu_plan->cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
946
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
947
+
948
+ if (cpu_plan->cplan.work_size > 0) {
949
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
950
+ if (cpu_plan->cplan.work_data == NULL) {
951
+ delete cpu_plan;
952
+ return NULL;
953
+ }
954
+ }
955
+
956
+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
957
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
958
+
959
+ return cpu_plan;
960
+ }
961
+
962
+ static void wsp_ggml_backend_cpu_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
963
+ struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
964
+
965
+ delete[] cpu_plan->cplan.work_data;
966
+ delete cpu_plan;
967
+
968
+ WSP_GGML_UNUSED(backend);
969
+ }
970
+
971
+ static enum wsp_ggml_status wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
972
+ struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
973
+
974
+ return wsp_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
975
+
976
+ WSP_GGML_UNUSED(backend);
977
+ }
978
+
979
+ static enum wsp_ggml_status wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
980
+ struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
981
+
982
+ struct wsp_ggml_cplan cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
983
+
984
+ if (cpu_ctx->work_size < cplan.work_size) {
985
+ delete[] cpu_ctx->work_data;
986
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
987
+ if (cpu_ctx->work_data == NULL) {
988
+ cpu_ctx->work_size = 0;
989
+ return WSP_GGML_STATUS_ALLOC_FAILED;
990
+ }
991
+ cpu_ctx->work_size = cplan.work_size;
992
+ }
993
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;
994
+
995
+ cplan.abort_callback = cpu_ctx->abort_callback;
996
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
997
+
998
+ return wsp_ggml_graph_compute(cgraph, &cplan);
999
+ }
1000
+
1001
+ static const struct wsp_ggml_backend_i wsp_ggml_backend_cpu_i = {
1002
+ /* .get_name = */ wsp_ggml_backend_cpu_get_name,
1003
+ /* .free = */ wsp_ggml_backend_cpu_free,
1004
+ /* .get_default_buffer_type = */ wsp_ggml_backend_cpu_get_default_buffer_type,
1005
+ /* .set_tensor_async = */ NULL,
1006
+ /* .get_tensor_async = */ NULL,
1007
+ /* .cpy_tensor_async = */ NULL,
1008
+ /* .synchronize = */ NULL,
1009
+ /* .graph_plan_create = */ wsp_ggml_backend_cpu_graph_plan_create,
1010
+ /* .graph_plan_free = */ wsp_ggml_backend_cpu_graph_plan_free,
1011
+ /* .graph_plan_update = */ NULL,
1012
+ /* .graph_plan_compute = */ wsp_ggml_backend_cpu_graph_plan_compute,
1013
+ /* .graph_compute = */ wsp_ggml_backend_cpu_graph_compute,
1014
+ /* .supports_op = */ NULL,
1015
+ /* .supports_buft = */ NULL,
1016
+ /* .offload_op = */ NULL,
1017
+ /* .event_record = */ NULL,
1018
+ /* .event_wait = */ NULL,
1019
+ };
1020
+
1021
+ static wsp_ggml_guid_t wsp_ggml_backend_cpu_guid(void) {
1022
+ static wsp_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
1023
+ return &guid;
1024
+ }
1025
+
1026
+ wsp_ggml_backend_t wsp_ggml_backend_cpu_init(void) {
1027
+ struct wsp_ggml_backend_cpu_context * ctx = new wsp_ggml_backend_cpu_context;
1028
+ if (ctx == NULL) {
1029
+ return NULL;
1030
+ }
1031
+
1032
+ ctx->n_threads = WSP_GGML_DEFAULT_N_THREADS;
1033
+ ctx->threadpool = NULL;
1034
+ ctx->work_data = NULL;
1035
+ ctx->work_size = 0;
1036
+ ctx->abort_callback = NULL;
1037
+ ctx->abort_callback_data = NULL;
1038
+
1039
+ wsp_ggml_backend_t cpu_backend = new wsp_ggml_backend {
1040
+ /* .guid = */ wsp_ggml_backend_cpu_guid(),
1041
+ /* .interface = */ wsp_ggml_backend_cpu_i,
1042
+ /* .device = */ wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
1043
+ /* .context = */ ctx,
1044
+ };
1045
+
1046
+ if (cpu_backend == NULL) {
1047
+ delete ctx;
1048
+ return NULL;
1049
+ }
1050
+
1051
+ return cpu_backend;
1052
+ }
1053
+
1054
+ bool wsp_ggml_backend_is_cpu(wsp_ggml_backend_t backend) {
1055
+ return backend != NULL && wsp_ggml_guid_matches(backend->guid, wsp_ggml_backend_cpu_guid());
1056
+ }
1057
+
1058
+ void wsp_ggml_backend_cpu_set_n_threads(wsp_ggml_backend_t backend_cpu, int n_threads) {
1059
+ WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backend_cpu));
1060
+
1061
+ struct wsp_ggml_backend_cpu_context * ctx = (struct wsp_ggml_backend_cpu_context *)backend_cpu->context;
1062
+ ctx->n_threads = n_threads;
1063
+ }
1064
+
1065
+ void wsp_ggml_backend_cpu_set_threadpool(wsp_ggml_backend_t backend_cpu, wsp_ggml_threadpool_t threadpool) {
1066
+ WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backend_cpu));
1067
+
1068
+ struct wsp_ggml_backend_cpu_context * ctx = (struct wsp_ggml_backend_cpu_context *)backend_cpu->context;
1069
+
1070
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
1071
+ // already had a different threadpool, pause/suspend it before switching
1072
+ wsp_ggml_threadpool_pause(ctx->threadpool);
1073
+ }
1074
+ ctx->threadpool = threadpool;
1075
+ }
1076
+
1077
+ void wsp_ggml_backend_cpu_set_abort_callback(wsp_ggml_backend_t backend_cpu, wsp_ggml_abort_callback abort_callback, void * abort_callback_data) {
1078
+ WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backend_cpu));
1079
+
1080
+ struct wsp_ggml_backend_cpu_context * ctx = (struct wsp_ggml_backend_cpu_context *)backend_cpu->context;
1081
+ ctx->abort_callback = abort_callback;
1082
+ ctx->abort_callback_data = abort_callback_data;
1083
+ }
1084
+
1085
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1086
+ WSP_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1087
+ return wsp_ggml_backend_buffer_init(wsp_ggml_backend_cpu_buffer_type(), wsp_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1088
+ }
1089
+
1090
+ ////////////////////////
1091
+
1092
+ struct wsp_ggml_backend_cpu_device_context {
1093
+ std::string description = "CPU";
1094
+
1095
+ wsp_ggml_backend_cpu_device_context() {
1096
+ #ifdef __APPLE__
1097
+ size_t len = 0;
1098
+ if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1099
+ description.resize(len);
1100
+ sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1101
+ }
1102
+ #elif defined(__linux__)
1103
+ FILE * f = fopen("/proc/cpuinfo", "r");
1104
+ if (f) {
1105
+ char buf[1024];
1106
+ while (fgets(buf, sizeof(buf), f)) {
1107
+ if (strncmp(buf, "model name", 10) == 0) {
1108
+ char * p = strchr(buf, ':');
1109
+ if (p) {
1110
+ p++;
1111
+ while (std::isspace(*p)) {
1112
+ p++;
1113
+ }
1114
+ while (std::isspace(p[strlen(p) - 1])) {
1115
+ p[strlen(p) - 1] = '\0';
1116
+ }
1117
+ description = p;
1118
+ break;
1119
+ }
1120
+ }
1121
+ }
1122
+ fclose(f);
1123
+ }
1124
+ #elif defined(_WIN32)
1125
+ HKEY hKey;
1126
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1127
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1128
+ 0,
1129
+ KEY_READ,
1130
+ &hKey) == ERROR_SUCCESS) {
1131
+ DWORD cpu_brand_size = 0;
1132
+ if (RegQueryValueExA(hKey,
1133
+ TEXT("ProcessorNameString"),
1134
+ NULL,
1135
+ NULL,
1136
+ NULL,
1137
+ &cpu_brand_size) == ERROR_SUCCESS) {
1138
+ description.resize(cpu_brand_size);
1139
+ if (RegQueryValueExA(hKey,
1140
+ TEXT("ProcessorNameString"),
1141
+ NULL,
1142
+ NULL,
1143
+ (LPBYTE)&description[0], // NOLINT
1144
+ &cpu_brand_size) == ERROR_SUCCESS) {
1145
+ if (description.find('\0') != std::string::npos) {
1146
+ description.resize(description.find('\0'));
1147
+ }
1148
+ }
1149
+ }
1150
+ RegCloseKey(hKey);
1151
+ }
1152
+ #endif
1153
+ }
1154
+ };
1155
+
1156
+ static const char * wsp_ggml_backend_cpu_device_get_name(wsp_ggml_backend_dev_t dev) {
1157
+ return "CPU";
1158
+
1159
+ WSP_GGML_UNUSED(dev);
1160
+ }
1161
+
1162
+ static const char * wsp_ggml_backend_cpu_device_get_description(wsp_ggml_backend_dev_t dev) {
1163
+ struct wsp_ggml_backend_cpu_device_context * ctx = (struct wsp_ggml_backend_cpu_device_context *)dev->context;
1164
+
1165
+ return ctx->description.c_str();
1166
+ }
1167
+
1168
+ static void wsp_ggml_backend_cpu_device_get_memory(wsp_ggml_backend_dev_t dev, size_t * free, size_t * total) {
1169
+ // TODO
1170
+ *free = 0;
1171
+ *total = 0;
1172
+
1173
+ WSP_GGML_UNUSED(dev);
1174
+ }
1175
+
1176
+ static enum wsp_ggml_backend_dev_type wsp_ggml_backend_cpu_device_get_type(wsp_ggml_backend_dev_t dev) {
1177
+ return WSP_GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
1178
+
1179
+ WSP_GGML_UNUSED(dev);
1180
+ }
1181
+
1182
+ static void wsp_ggml_backend_cpu_device_get_props(wsp_ggml_backend_dev_t dev, struct wsp_ggml_backend_dev_props * props) {
1183
+ props->name = wsp_ggml_backend_cpu_device_get_name(dev);
1184
+ props->description = wsp_ggml_backend_cpu_device_get_description(dev);
1185
+ props->type = wsp_ggml_backend_cpu_device_get_type(dev);
1186
+ wsp_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1187
+ props->caps = {
1188
+ /* .async = */ false,
1189
+ /* .host_buffer = */ false,
1190
+ /* .buffer_from_host_ptr = */ true,
1191
+ /* .events = */ false,
1192
+ };
1193
+ }
1194
+
1195
+ static wsp_ggml_backend_t wsp_ggml_backend_cpu_device_init(wsp_ggml_backend_dev_t dev, const char * params) {
1196
+ return wsp_ggml_backend_cpu_init();
1197
+
1198
+ WSP_GGML_UNUSED(dev);
1199
+ WSP_GGML_UNUSED(params);
1200
+ }
1201
+
1202
+ static wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_device_get_buffer_type(wsp_ggml_backend_dev_t dev) {
1203
+ return wsp_ggml_backend_cpu_buffer_type();
1204
+
1205
+ WSP_GGML_UNUSED(dev);
1206
+ }
1207
+
1208
+ static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_device_buffer_from_ptr(wsp_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1209
+ return wsp_ggml_backend_cpu_buffer_from_ptr(ptr, size);
1210
+
1211
+ WSP_GGML_UNUSED(dev);
1212
+ WSP_GGML_UNUSED(max_tensor_size);
1213
+ }
1214
+
1215
+ static bool wsp_ggml_backend_cpu_device_supports_op(wsp_ggml_backend_dev_t dev, const struct wsp_ggml_tensor * op) {
1216
+ switch (op->op) {
1217
+ case WSP_GGML_OP_CPY:
1218
+ return
1219
+ op->type != WSP_GGML_TYPE_IQ2_XXS &&
1220
+ op->type != WSP_GGML_TYPE_IQ2_XS &&
1221
+ op->type != WSP_GGML_TYPE_IQ1_S &&
1222
+ op->type != WSP_GGML_TYPE_IQ1_M; // missing type_traits.from_float
1223
+ case WSP_GGML_OP_MUL_MAT:
1224
+ return op->src[1]->type == WSP_GGML_TYPE_F32 || op->src[1]->type == wsp_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
1225
+ case WSP_GGML_OP_ROPE_BACK:
1226
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
1227
+ case WSP_GGML_OP_IM2COL_BACK:
1228
+ return op->src[0]->type == WSP_GGML_TYPE_F32 && op->src[1]->type == WSP_GGML_TYPE_F32;
1229
+ case WSP_GGML_OP_OUT_PROD:
1230
+ return (op->src[0]->type == WSP_GGML_TYPE_F32 || wsp_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == WSP_GGML_TYPE_F32;
1231
+ default:
1232
+ return true;
1233
+ }
1234
+
1235
+ WSP_GGML_UNUSED(dev);
1236
+ }
1237
+
1238
+ static bool wsp_ggml_backend_cpu_device_supports_buft(wsp_ggml_backend_dev_t dev, wsp_ggml_backend_buffer_type_t buft) {
1239
+ return wsp_ggml_backend_buft_is_host(buft);
1240
+
1241
+ WSP_GGML_UNUSED(dev);
1242
+ }
1243
+
1244
+ static const struct wsp_ggml_backend_device_i wsp_ggml_backend_cpu_device_i = {
1245
+ /* .get_name = */ wsp_ggml_backend_cpu_device_get_name,
1246
+ /* .get_description = */ wsp_ggml_backend_cpu_device_get_description,
1247
+ /* .get_memory = */ wsp_ggml_backend_cpu_device_get_memory,
1248
+ /* .get_type = */ wsp_ggml_backend_cpu_device_get_type,
1249
+ /* .get_props = */ wsp_ggml_backend_cpu_device_get_props,
1250
+ /* .init_backend = */ wsp_ggml_backend_cpu_device_init,
1251
+ /* .get_buffer_type = */ wsp_ggml_backend_cpu_device_get_buffer_type,
1252
+ /* .get_host_buffer_type = */ NULL,
1253
+ /* .buffer_from_host_ptr = */ wsp_ggml_backend_cpu_device_buffer_from_ptr,
1254
+ /* .supports_op = */ wsp_ggml_backend_cpu_device_supports_op,
1255
+ /* .supports_buft = */ wsp_ggml_backend_cpu_device_supports_buft,
1256
+ /* .offload_op = */ NULL,
1257
+ /* .event_new = */ NULL,
1258
+ /* .event_free = */ NULL,
1259
+ /* .event_synchronize = */ NULL,
1260
+ };
1261
+
1262
+ ////////////////////////
1263
+
1264
+ static const char * wsp_ggml_backend_cpu_reg_get_name(wsp_ggml_backend_reg_t reg) {
1265
+ return "CPU";
1266
+
1267
+ WSP_GGML_UNUSED(reg);
1268
+ }
1269
+
1270
+ static size_t wsp_ggml_backend_cpu_reg_get_device_count(wsp_ggml_backend_reg_t reg) {
1271
+ return 1;
1272
+
1273
+ WSP_GGML_UNUSED(reg);
1274
+ }
1275
+
1276
+ static wsp_ggml_backend_dev_t wsp_ggml_backend_cpu_reg_get_device(wsp_ggml_backend_reg_t reg, size_t index) {
1277
+ WSP_GGML_ASSERT(index == 0);
1278
+
1279
+ static wsp_ggml_backend_cpu_device_context ctx;
1280
+ static wsp_ggml_backend_device wsp_ggml_backend_cpu_device = {
1281
+ /* .iface = */ wsp_ggml_backend_cpu_device_i,
1282
+ /* .reg = */ reg,
1283
+ /* .context = */ &ctx,
1284
+ };
1285
+
1286
+ return &wsp_ggml_backend_cpu_device;
1287
+ }
1288
+
1289
+ static void * wsp_ggml_backend_cpu_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
1290
+ if (strcmp(name, "wsp_ggml_backend_set_n_threads") == 0) {
1291
+ return (void *)wsp_ggml_backend_cpu_set_n_threads;
1292
+ }
1293
+ return NULL;
1294
+
1295
+ WSP_GGML_UNUSED(reg);
1296
+ }
1297
+
1298
+ static const struct wsp_ggml_backend_reg_i wsp_ggml_backend_cpu_reg_i = {
1299
+ /* .get_name = */ wsp_ggml_backend_cpu_reg_get_name,
1300
+ /* .get_device_count = */ wsp_ggml_backend_cpu_reg_get_device_count,
1301
+ /* .get_device = */ wsp_ggml_backend_cpu_reg_get_device,
1302
+ /* .get_proc_address = */ wsp_ggml_backend_cpu_get_proc_address,
1303
+ };
1304
+
1305
+ wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void) {
1306
+ static struct wsp_ggml_backend_reg wsp_ggml_backend_cpu_reg = {
1307
+ /* .iface = */ wsp_ggml_backend_cpu_reg_i,
1308
+ /* .context = */ NULL,
1309
+ };
1310
+
1311
+ return &wsp_ggml_backend_cpu_reg;
1312
+ }
1313
+
1314
+ // multi-buffer buffer
1315
+
1316
+ struct wsp_ggml_backend_multi_buffer_context {
1317
+ wsp_ggml_backend_buffer_t * buffers;
1318
+ size_t n_buffers;
1319
+ };
1320
+
1321
+ static const char * wsp_ggml_backend_multi_buffer_get_name(wsp_ggml_backend_buffer_t buffer) {
1322
+ wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
1323
+
1324
+ return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
1325
+ }
1326
+
1327
+ static void wsp_ggml_backend_multi_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
1328
+ wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
1329
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
1330
+ wsp_ggml_backend_buffer_free(ctx->buffers[i]);
1331
+ }
1332
+
1333
+ free(ctx->buffers);
1334
+ free(ctx);
1335
+ }
1336
+
1337
+ static void wsp_ggml_backend_multi_buffer_clear(wsp_ggml_backend_buffer_t buffer, uint8_t value) {
1338
+ wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
1339
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
1340
+ wsp_ggml_backend_buffer_clear(ctx->buffers[i], value);
1341
+ }
1342
+ }
1343
+
1344
+ static const struct wsp_ggml_backend_buffer_i wsp_ggml_backend_multi_buffer_i = {
1345
+ /* .get_name = */ wsp_ggml_backend_multi_buffer_get_name,
1346
+ /* .free_buffer = */ wsp_ggml_backend_multi_buffer_free_buffer,
1347
+ /* .get_base = */ NULL,
1348
+ /* .init_tensor = */ NULL,
1349
+ /* .memset_tensor = */ NULL,
1350
+ /* .set_tensor = */ NULL,
1351
+ /* .get_tensor = */ NULL,
1352
+ /* .cpy_tensor = */ NULL,
1353
+ /* .clear = */ wsp_ggml_backend_multi_buffer_clear,
1354
+ /* .reset = */ NULL,
1355
+ };
1356
+
1357
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_multi_buffer_alloc_buffer(wsp_ggml_backend_buffer_t * buffers, size_t n_buffers) {
1358
+ wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) malloc(sizeof(struct wsp_ggml_backend_multi_buffer_context));
1359
+ ctx->n_buffers = n_buffers;
1360
+ ctx->buffers = (wsp_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(wsp_ggml_backend_buffer_t));
1361
+
1362
+ WSP_GGML_ASSERT(ctx->buffers != NULL);
1363
+
1364
+ size_t total_size = 0;
1365
+ for (size_t i = 0; i < n_buffers; i++) {
1366
+ ctx->buffers[i] = buffers[i];
1367
+ total_size += wsp_ggml_backend_buffer_get_size(buffers[i]);
1368
+ }
1369
+
1370
+ return wsp_ggml_backend_buffer_init(buffers[0]->buft, wsp_ggml_backend_multi_buffer_i, ctx, total_size);
1371
+ }
1372
+
1373
+ bool wsp_ggml_backend_buffer_is_multi_buffer(wsp_ggml_backend_buffer_t buffer) {
1374
+ return buffer->iface.get_name == wsp_ggml_backend_multi_buffer_get_name;
1375
+ }
1376
+
1377
+ void wsp_ggml_backend_multi_buffer_set_usage(wsp_ggml_backend_buffer_t buffer, enum wsp_ggml_backend_buffer_usage usage) {
1378
+ WSP_GGML_ASSERT(wsp_ggml_backend_buffer_is_multi_buffer(buffer));
1379
+ wsp_ggml_backend_multi_buffer_context * ctx = (wsp_ggml_backend_multi_buffer_context *) buffer->context;
1380
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
1381
+ wsp_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1382
+ }
1383
+ }
1384
+
1385
+ // creates a copy of the tensor with the same memory layout
1386
+ static struct wsp_ggml_tensor * wsp_ggml_dup_tensor_layout(struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * tensor) {
1387
+ struct wsp_ggml_tensor * dup = wsp_ggml_dup_tensor(ctx, tensor);
1388
+ for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
1389
+ dup->nb[i] = tensor->nb[i];
1390
+ }
1391
+ return dup;
1392
+ }
1393
+
1394
+ static bool wsp_ggml_is_view_op(enum wsp_ggml_op op) {
1395
+ return op == WSP_GGML_OP_VIEW || op == WSP_GGML_OP_RESHAPE || op == WSP_GGML_OP_PERMUTE || op == WSP_GGML_OP_TRANSPOSE;
1396
+ }
1397
+
1398
+ // scheduler
1399
+
1400
+ #ifndef WSP_GGML_SCHED_MAX_BACKENDS
1401
+ #define WSP_GGML_SCHED_MAX_BACKENDS 16
1402
+ #endif
1403
+
1404
+ #ifndef WSP_GGML_SCHED_MAX_SPLIT_INPUTS
1405
+ #define WSP_GGML_SCHED_MAX_SPLIT_INPUTS WSP_GGML_MAX_SRC
1406
+ #endif
1407
+
1408
+ #ifndef WSP_GGML_SCHED_MAX_COPIES
1409
+ #define WSP_GGML_SCHED_MAX_COPIES 4
1410
+ #endif
1411
+
1412
+ struct wsp_ggml_backend_sched_split {
1413
+ int backend_id;
1414
+ int i_start;
1415
+ int i_end;
1416
+ struct wsp_ggml_tensor * inputs[WSP_GGML_SCHED_MAX_SPLIT_INPUTS];
1417
+ int n_inputs;
1418
+ // graph view of this split
1419
+ struct wsp_ggml_cgraph graph;
1420
+ };
1421
+
1422
+ struct wsp_ggml_backend_sched {
1423
+ bool is_reset; // true if the scheduler has been reset since the last graph split
1424
+ bool is_alloc;
1425
+
1426
+ int n_backends;
1427
+
1428
+ wsp_ggml_backend_t backends[WSP_GGML_SCHED_MAX_BACKENDS];
1429
+ wsp_ggml_backend_buffer_type_t bufts[WSP_GGML_SCHED_MAX_BACKENDS];
1430
+ wsp_ggml_gallocr_t galloc;
1431
+
1432
+ // hash map of the nodes in the graph
1433
+ struct wsp_ggml_hash_set hash_set;
1434
+ int * hv_tensor_backend_ids; // [hash_set.size]
1435
+ struct wsp_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
1436
+
1437
+ int * node_backend_ids; // [graph_size]
1438
+ int * leaf_backend_ids; // [graph_size]
1439
+
1440
+ int * prev_node_backend_ids; // [graph_size]
1441
+ int * prev_leaf_backend_ids; // [graph_size]
1442
+
1443
+ // copy of the graph with modified inputs
1444
+ struct wsp_ggml_cgraph graph;
1445
+
1446
+ // graph splits
1447
+ struct wsp_ggml_backend_sched_split * splits;
1448
+ int n_splits;
1449
+ int splits_capacity;
1450
+
1451
+ // pipeline parallelism support
1452
+ int n_copies;
1453
+ int cur_copy;
1454
+ wsp_ggml_backend_event_t events[WSP_GGML_SCHED_MAX_BACKENDS][WSP_GGML_SCHED_MAX_COPIES];
1455
+ struct wsp_ggml_tensor * graph_inputs[WSP_GGML_SCHED_MAX_SPLIT_INPUTS];
1456
+ int n_graph_inputs;
1457
+
1458
+ struct wsp_ggml_context * ctx;
1459
+
1460
+ wsp_ggml_backend_sched_eval_callback callback_eval;
1461
+ void * callback_eval_user_data;
1462
+
1463
+ char * context_buffer;
1464
+ size_t context_buffer_size;
1465
+
1466
+ bool debug;
1467
+ };
1468
+
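+ // convenience macros over the scheduler hash table; tensor copies are stored in a flat
+ // array indexed as [hash id][backend id][copy id]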
1469
+ #define hash_id(tensor) wsp_ggml_hash_find_or_insert(&sched->hash_set, tensor)
1470
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1471
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1472
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1473
+
1474
+ // returns the index of the backend in the scheduler's backend list; a lower index means a higher priority
1475
+ static int wsp_ggml_backend_sched_backend_id(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
1476
+ for (int i = 0; i < sched->n_backends; i++) {
1477
+ if (sched->backends[i] == backend) {
1478
+ return i;
1479
+ }
1480
+ }
1481
+ return -1;
1482
+ }
1483
+
1484
+ static int wsp_ggml_backend_sched_backend_from_buffer(wsp_ggml_backend_sched_t sched, const struct wsp_ggml_tensor * tensor, const struct wsp_ggml_tensor * op) {
1485
+ wsp_ggml_backend_buffer_t buffer = tensor->buffer;
1486
+ if (buffer == NULL) {
1487
+ return -1;
1488
+ }
1489
+
1490
+ // find highest prio backend that supports the buffer type and the op
1491
+ for (int i = 0; i < sched->n_backends; i++) {
1492
+ if (wsp_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1493
+ wsp_ggml_backend_supports_op(sched->backends[i], op)) {
1494
+ return i;
1495
+ }
1496
+ }
1497
+
1498
+ #ifndef NDEBUG
1499
+ WSP_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1500
+ __func__, wsp_ggml_op_desc(tensor), wsp_ggml_backend_buffer_name(buffer), tensor->name);
1501
+ #endif
1502
+
1503
+ return -1;
1504
+ }
1505
+
1506
+ #if 0
1507
+ #define WSP_GGML_SCHED_MAX_SPLITS_DEBUG 4096
1508
+ static char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*16 + WSP_GGML_SCHED_MAX_SPLITS_DEBUG*WSP_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1509
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1510
+ #define GET_CAUSE(node) causes[hash_id(node)]
1511
+ #else
1512
+ #define SET_CAUSE(node, ...)
1513
+ #define GET_CAUSE(node) ""
1514
+ #endif
1515
+
1516
+ // returns the backend that should be used for the node, based on where the node and its sources are currently allocated
1517
+ static int wsp_ggml_backend_sched_backend_id_from_cur(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * tensor) {
1518
+ // TODO: use supports_op to check if the backend supports the op
1519
+
1520
+ // assign pre-allocated nodes to their backend
1521
+ int cur_backend_id = wsp_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1522
+ if (cur_backend_id != -1) {
1523
+ SET_CAUSE(tensor, "1.dst");
1524
+ return cur_backend_id;
1525
+ }
1526
+
1527
+ // view_src
1528
+ if (tensor->view_src != NULL) {
1529
+ cur_backend_id = wsp_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1530
+ if (cur_backend_id != -1) {
1531
+ SET_CAUSE(tensor, "1.vsrc");
1532
+ return cur_backend_id;
1533
+ }
1534
+ }
1535
+
1536
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1537
+ // since the tensor is pre-allocated, it cannot be moved to another backend
1538
+ WSP_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
1539
+ }
1540
+
1541
+ // graph input
1542
+ if (tensor->flags & WSP_GGML_TENSOR_FLAG_INPUT) {
1543
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1544
+ SET_CAUSE(tensor, "1.inp");
1545
+ return cur_backend_id;
1546
+ }
1547
+
1548
+ // operations with weights are preferably run on the same backend as the weights
1549
+ for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
1550
+ const struct wsp_ggml_tensor * src = tensor->src[i];
1551
+ if (src == NULL) {
1552
+ continue;
1553
+ }
1554
+ if (src->buffer != NULL && src->buffer->usage == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1555
+ int src_backend_id = wsp_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1556
+ // check if a backend with higher prio wants to offload the op
1557
+ if (src_backend_id == sched->n_backends - 1) {
1558
+ for (int b = 0; b < src_backend_id; b++) {
1559
+ if (wsp_ggml_backend_supports_op(sched->backends[b], tensor) && wsp_ggml_backend_offload_op(sched->backends[b], tensor)) {
1560
+ SET_CAUSE(tensor, "1.off");
1561
+ return b;
1562
+ }
1563
+ }
1564
+ }
1565
+ SET_CAUSE(tensor, "1.wgt%d", i);
1566
+ return src_backend_id;
1567
+ }
1568
+ }
1569
+
1570
+ return -1;
1571
+ }
1572
+
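+ // formats a byte size as "<n>M" or "<n>K" for the assignment debug output below; the result
+ // points into a static buffer, so each value must be printed before the next call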
1573
+ static char * fmt_size(size_t size) {
1574
+ static char buffer[128];
1575
+ if (size >= 1024*1024) {
1576
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1577
+ } else {
1578
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1579
+ }
1580
+ return buffer;
1581
+ }
1582
+
1583
+ static void wsp_ggml_backend_sched_print_assignments(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
1584
+ int cur_split = 0;
1585
+ for (int i = 0; i < graph->n_nodes; i++) {
1586
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1587
+ wsp_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1588
+ WSP_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, wsp_ggml_backend_name(split_backend),
1589
+ sched->splits[cur_split].n_inputs);
1590
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1591
+ WSP_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1592
+ fmt_size(wsp_ggml_nbytes(sched->splits[cur_split].inputs[j])));
1593
+ }
1594
+ WSP_GGML_LOG_DEBUG("\n");
1595
+ cur_split++;
1596
+ }
1597
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1598
+ if (wsp_ggml_is_view_op(node->op)) {
1599
+ continue;
1600
+ }
1601
+ wsp_ggml_backend_t tensor_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, node);
1602
+ WSP_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name,
1603
+ fmt_size(wsp_ggml_nbytes(node)), tensor_backend ? wsp_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1604
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1605
+ struct wsp_ggml_tensor * src = node->src[j];
1606
+ if (src == NULL) {
1607
+ continue;
1608
+ }
1609
+ wsp_ggml_backend_t src_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, src);
1610
+ WSP_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1611
+ fmt_size(wsp_ggml_nbytes(src)), src_backend ? wsp_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1612
+ }
1613
+ WSP_GGML_LOG_DEBUG("\n");
1614
+ }
1615
+ }
1616
+
1617
+ static bool wsp_ggml_backend_sched_buffer_supported(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * t, int backend_id) {
1618
+ wsp_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1619
+ wsp_ggml_backend_buffer_type_t buft = NULL;
1620
+
1621
+ if (buf) {
1622
+ // the tensor is already allocated
1623
+ buft = buf->buft;
1624
+ } else {
1625
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
1626
+ int tensor_backend_id = tensor_backend_id(t);
1627
+ if (tensor_backend_id == -1 && t->view_src) {
1628
+ tensor_backend_id = tensor_backend_id(t->view_src);
1629
+ }
1630
+ if (tensor_backend_id != -1) {
1631
+ buft = sched->bufts[tensor_backend_id];
1632
+ }
1633
+ }
1634
+
1635
+ return buft != NULL && wsp_ggml_backend_supports_buft(sched->backends[backend_id], buft);
1636
+ }
1637
+
1638
+ static void wsp_ggml_backend_sched_set_if_supported(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1639
+ if (wsp_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1640
+ *node_backend_id = cur_backend_id;
1641
+ SET_CAUSE(node, "2.sup");
1642
+ }
1643
+ }
1644
+
1645
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
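+ // the assignment is done in five passes: (1) pre-allocated tensors, (2) expansion of existing
+ // assignments to adjacent nodes, (3) upgrades to higher-priority backends with compatible buffer
+ // types, (4) remaining sources, and (5) the actual split into per-backend subgraphs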
1646
+ static void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
1647
+ // reset splits
1648
+ sched->n_splits = 0;
1649
+ sched->n_graph_inputs = 0;
1650
+ sched->is_reset = false;
1651
+
1652
+ struct wsp_ggml_init_params params = {
1653
+ /* .mem_size = */ sched->context_buffer_size,
1654
+ /* .mem_buffer = */ sched->context_buffer,
1655
+ /* .no_alloc = */ true
1656
+ };
1657
+
1658
+ wsp_ggml_free(sched->ctx);
1659
+
1660
+ sched->ctx = wsp_ggml_init(params);
1661
+ if (sched->ctx == NULL) {
1662
+ WSP_GGML_ABORT("%s: failed to initialize context\n", __func__);
1663
+ }
1664
+
1665
+ // pass 1: assign backends to ops with pre-allocated inputs
1666
+ for (int i = 0; i < graph->n_leafs; i++) {
1667
+ struct wsp_ggml_tensor * leaf = graph->leafs[i];
1668
+ int * leaf_backend_id = &tensor_backend_id(leaf);
1669
+ // do not overwrite user assignments
1670
+ if (*leaf_backend_id == -1) {
1671
+ *leaf_backend_id = wsp_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1672
+ }
1673
+ }
1674
+
1675
+ for (int i = 0; i < graph->n_nodes; i++) {
1676
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1677
+ int * node_backend_id = &tensor_backend_id(node);
1678
+ // do not overwrite user assignments
1679
+ if (*node_backend_id == -1) {
1680
+ *node_backend_id = wsp_ggml_backend_sched_backend_id_from_cur(sched, node);
1681
+
1682
+ #if 0
1683
+ // src
1684
+ if (node->op == WSP_GGML_OP_NONE) {
1685
+ continue;
1686
+ }
1687
+
1688
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1689
+ struct wsp_ggml_tensor * src = node->src[j];
1690
+ if (src == NULL) {
1691
+ continue;
1692
+ }
1693
+ int * src_backend_id = &tensor_backend_id(src);
1694
+ if (*src_backend_id == -1) {
1695
+ *src_backend_id = wsp_ggml_backend_sched_backend_id_from_cur(sched, src);
1696
+ }
1697
+ }
1698
+ #endif
1699
+ }
1700
+ }
1701
+
1702
+ // pass 2: expand current backend assignments
1703
+ // assign the same backend to adjacent nodes
1704
+ // expand gpu backends (i.e. non-last-priority backends) up and down, ignoring cpu (the lowest-priority backend)
1705
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1706
+ // ops unsupported by the backend being expanded will be left unassigned, so that they can be assigned later once the locations of their inputs are known
1707
+ // expand gpu down
1708
+ {
1709
+ int cur_backend_id = -1;
1710
+ for (int i = 0; i < graph->n_nodes; i++) {
1711
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1712
+ if (wsp_ggml_is_view_op(node->op)) {
1713
+ continue;
1714
+ }
1715
+ int * node_backend_id = &tensor_backend_id(node);
1716
+ if (*node_backend_id != -1) {
1717
+ if (*node_backend_id == sched->n_backends - 1) {
1718
+ // skip cpu (lowest prio backend)
1719
+ cur_backend_id = -1;
1720
+ } else {
1721
+ cur_backend_id = *node_backend_id;
1722
+ }
1723
+ } else if (cur_backend_id != -1) {
1724
+ wsp_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1725
+ }
1726
+ }
1727
+ }
1728
+ // expand gpu up
1729
+ {
1730
+ int cur_backend_id = -1;
1731
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1732
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1733
+ if (wsp_ggml_is_view_op(node->op)) {
1734
+ continue;
1735
+ }
1736
+ int * node_backend_id = &tensor_backend_id(node);
1737
+ if (*node_backend_id != -1) {
1738
+ if (*node_backend_id == sched->n_backends - 1) {
1739
+ // skip cpu (lowest prio backend)
1740
+ cur_backend_id = -1;
1741
+ } else {
1742
+ cur_backend_id = *node_backend_id;
1743
+ }
1744
+ } else if (cur_backend_id != -1) {
1745
+ wsp_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1746
+ }
1747
+ }
1748
+ }
1749
+ // expand rest down
1750
+ {
1751
+ int cur_backend_id = -1;
1752
+ for (int i = 0; i < graph->n_nodes; i++) {
1753
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1754
+ if (wsp_ggml_is_view_op(node->op)) {
1755
+ continue;
1756
+ }
1757
+ int * node_backend_id = &tensor_backend_id(node);
1758
+ if (*node_backend_id != -1) {
1759
+ cur_backend_id = *node_backend_id;
1760
+ } else if (cur_backend_id != -1) {
1761
+ wsp_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1762
+ }
1763
+ }
1764
+ }
1765
+ // expand rest up
1766
+ {
1767
+ int cur_backend_id = -1;
1768
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1769
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1770
+ if (wsp_ggml_is_view_op(node->op)) {
1771
+ continue;
1772
+ }
1773
+ int * node_backend_id = &tensor_backend_id(node);
1774
+ if (*node_backend_id != -1) {
1775
+ cur_backend_id = *node_backend_id;
1776
+ } else if (cur_backend_id != -1) {
1777
+ wsp_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1778
+ }
1779
+ }
1780
+ }
1781
+
1782
+ // pass 3: upgrade nodes to higher-priority backends with compatible buffer types
1783
+ // if the node's current backend uses the same buffer type (*) as a higher-priority backend, we should move the node there
1784
+ // however, we also need to verify that the sources are in compatible buffer types
1785
+ // (*) the actual requirement is more relaxed: the buffer type of the backend should be supported by all the users of this tensor further down the graph
1786
+ // however, this is slow to verify, so we use the stricter requirement that the buffer type is the same
1787
+ // this is not uncommon, since multiple backends can use host memory with the same buffer type (e.g. BLAS and CPU)
1788
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1789
+ // only nodes that could not be assigned during expansion because the backend does not support the op should be unassigned at this point
1790
+ for (int i = 0; i < graph->n_nodes; i++) {
1791
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1792
+ if (wsp_ggml_is_view_op(node->op)) {
1793
+ continue;
1794
+ }
1795
+ int * node_backend_id = &tensor_backend_id(node);
1796
+ if (*node_backend_id == -1) {
1797
+ // unassigned node: find the backend with the most supported inputs
1798
+ int n_supported_best = -1;
1799
+ for (int b = 0; b < sched->n_backends; b++) {
1800
+ if (wsp_ggml_backend_supports_op(sched->backends[b], node)) {
1801
+ int n_supported = 0;
1802
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1803
+ struct wsp_ggml_tensor * src = node->src[j];
1804
+ if (src == NULL) {
1805
+ continue;
1806
+ }
1807
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && wsp_ggml_backend_sched_buffer_supported(sched, src, b)) {
1808
+ n_supported++;
1809
+ }
1810
+ }
1811
+ if (n_supported > n_supported_best) {
1812
+ n_supported_best = n_supported;
1813
+ *node_backend_id = b;
1814
+ SET_CAUSE(node, "3.best");
1815
+ }
1816
+ }
1817
+ }
1818
+ } else {
1819
+ // assigned node: upgrade to higher prio backend if possible
1820
+ for (int b = 0; b < *node_backend_id; b++) {
1821
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && wsp_ggml_backend_supports_op(sched->backends[b], node)) {
1822
+ bool supported = true;
1823
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1824
+ struct wsp_ggml_tensor * src = node->src[j];
1825
+ if (src == NULL) {
1826
+ continue;
1827
+ }
1828
+ if (!wsp_ggml_backend_sched_buffer_supported(sched, src, b)) {
1829
+ supported = false;
1830
+ break;
1831
+ }
1832
+ }
1833
+ if (supported) {
1834
+ *node_backend_id = b;
1835
+ SET_CAUSE(node, "3.upg");
1836
+ break;
1837
+ }
1838
+ }
1839
+ }
1840
+ }
1841
+ }
1842
+
1843
+ // pass 4: assign backends to remaining src from dst and view_src
1844
+ for (int i = 0; i < graph->n_nodes; i++) {
1845
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1846
+ int * cur_backend_id = &tensor_backend_id(node);
1847
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1848
+ *cur_backend_id = tensor_backend_id(node->view_src);
1849
+ SET_CAUSE(node, "4.vsrc");
1850
+ }
1851
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1852
+ struct wsp_ggml_tensor * src = node->src[j];
1853
+ if (src == NULL) {
1854
+ continue;
1855
+ }
1856
+ int * src_backend_id = &tensor_backend_id(src);
1857
+ if (*src_backend_id == -1) {
1858
+ if (src->view_src != NULL) {
1859
+ // views are always on the same backend as the source
1860
+ *src_backend_id = tensor_backend_id(src->view_src);
1861
+ SET_CAUSE(src, "4.vsrc");
1862
+ } else {
1863
+ *src_backend_id = *cur_backend_id;
1864
+ SET_CAUSE(src, "4.cur");
1865
+ }
1866
+ }
1867
+ }
1868
+ }
1869
+
1870
+ // pass 5: split graph, find tensors that need to be copied
1871
+ {
1872
+ int i_split = 0;
1873
+ struct wsp_ggml_backend_sched_split * split = &sched->splits[0];
1874
+ // find the backend of the first split, skipping view ops
1875
+ int i = 0;
1876
+ for (; i < graph->n_nodes; i++) {
1877
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1878
+ if (!wsp_ggml_is_view_op(node->op)) {
1879
+ split->backend_id = tensor_backend_id(node);
1880
+ break;
1881
+ }
1882
+ }
1883
+ split->i_start = 0;
1884
+ split->n_inputs = 0;
1885
+ int cur_backend_id = split->backend_id;
1886
+ for (; i < graph->n_nodes; i++) {
1887
+ struct wsp_ggml_tensor * node = graph->nodes[i];
1888
+
1889
+ if (wsp_ggml_is_view_op(node->op)) {
1890
+ continue;
1891
+ }
1892
+
1893
+ const int node_backend_id = tensor_backend_id(node);
1894
+
1895
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1896
+
1897
+ // check if we should start a new split based on the sources of the current node
1898
+ bool need_new_split = false;
1899
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1900
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1901
+ struct wsp_ggml_tensor * src = node->src[j];
1902
+ if (src == NULL) {
1903
+ continue;
1904
+ }
1905
+ // check if a weight is on a different backend
1906
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1907
+ if (src->buffer != NULL && src->buffer->usage == WSP_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1908
+ int src_backend_id = tensor_backend_id(src);
1909
+ if (src_backend_id != cur_backend_id) {
1910
+ need_new_split = true;
1911
+ break;
1912
+ }
1913
+ }
1914
+ // check if the split has too many inputs
1915
+ // FIXME: count the number of inputs instead of only checking when full
1916
+ if (split->n_inputs == WSP_GGML_SCHED_MAX_SPLIT_INPUTS) {
1917
+ const size_t id = hash_id(src);
1918
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1919
+ bool supported = wsp_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1920
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1921
+ //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1922
+ need_new_split = true;
1923
+ break;
1924
+ }
1925
+ }
1926
+ }
1927
+ }
1928
+
1929
+ if (node_backend_id != cur_backend_id || need_new_split) {
1930
+ split->i_end = i;
1931
+ i_split++;
1932
+ if (i_split >= sched->splits_capacity) {
1933
+ sched->splits_capacity *= 2;
1934
+ sched->splits = (wsp_ggml_backend_sched_split *)
1935
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct wsp_ggml_backend_sched_split));
1936
+ WSP_GGML_ASSERT(sched->splits != NULL);
1937
+ }
1938
+ split = &sched->splits[i_split];
1939
+ split->backend_id = node_backend_id;
1940
+ split->i_start = i;
1941
+ split->n_inputs = 0;
1942
+ cur_backend_id = node_backend_id;
1943
+ }
1944
+
1945
+ // find inputs that are not on the same backend
1946
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
1947
+ struct wsp_ggml_tensor * src = node->src[j];
1948
+ if (src == NULL) {
1949
+ continue;
1950
+ }
1951
+
1952
+ size_t src_id = hash_id(src);
1953
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1954
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1955
+
1956
+ if (src->flags & WSP_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1957
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1958
+ wsp_ggml_backend_t backend = sched->backends[src_backend_id];
1959
+ for (int c = 0; c < sched->n_copies; c++) {
1960
+ struct wsp_ggml_tensor * tensor_copy;
1961
+ if (c == sched->cur_copy) {
1962
+ tensor_copy = src; // use the original tensor as the current copy
1963
+ } else {
1964
+ tensor_copy = wsp_ggml_dup_tensor_layout(sched->ctx, src);
1965
+ wsp_ggml_format_name(tensor_copy, "%s#%s#%d", wsp_ggml_backend_name(backend), src->name, c);
1966
+ }
1967
+ if (sched->n_copies > 1) {
1968
+ wsp_ggml_set_input(tensor_copy);
1969
+ wsp_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1970
+ }
1971
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1972
+ SET_CAUSE(tensor_copy, "4.cpy");
1973
+ }
1974
+ int n_graph_inputs = sched->n_graph_inputs++;
1975
+ WSP_GGML_ASSERT(n_graph_inputs < WSP_GGML_SCHED_MAX_SPLIT_INPUTS);
1976
+ sched->graph_inputs[n_graph_inputs] = src;
1977
+ }
1978
+ }
1979
+
1980
+ if (src_backend_id != cur_backend_id && !wsp_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1981
+ // create a copy of the input in the split's backend
1982
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1983
+ wsp_ggml_backend_t backend = sched->backends[cur_backend_id];
1984
+ for (int c = 0; c < sched->n_copies; c++) {
1985
+ struct wsp_ggml_tensor * tensor_copy = wsp_ggml_dup_tensor_layout(sched->ctx, src);
1986
+ wsp_ggml_format_name(tensor_copy, "%s#%s#%d", wsp_ggml_backend_name(backend), src->name, c);
1987
+ if (sched->n_copies > 1) {
1988
+ wsp_ggml_set_input(tensor_copy);
1989
+ wsp_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1990
+ }
1991
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1992
+ SET_CAUSE(tensor_copy, "4.cpy");
1993
+ }
1994
+ int n_inputs = split->n_inputs++;
1995
+ WSP_GGML_ASSERT(n_inputs < WSP_GGML_SCHED_MAX_SPLIT_INPUTS);
1996
+ split->inputs[n_inputs] = src;
1997
+ }
1998
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1999
+ }
2000
+ }
2001
+ }
2002
+ split->i_end = graph->n_nodes;
2003
+ sched->n_splits = i_split + 1;
2004
+ }
2005
+
2006
+ if (sched->debug) {
2007
+ wsp_ggml_backend_sched_print_assignments(sched, graph);
2008
+ }
2009
+
2010
+ // swap node_backend_ids and leaf_backend_ids with their previous values
2011
+ {
2012
+ int * tmp = sched->node_backend_ids;
2013
+ sched->node_backend_ids = sched->prev_node_backend_ids;
2014
+ sched->prev_node_backend_ids = tmp;
2015
+
2016
+ tmp = sched->leaf_backend_ids;
2017
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
2018
+ sched->prev_leaf_backend_ids = tmp;
2019
+ }
2020
+
2021
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*WSP_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
2022
+ if (sched->graph.size < graph_size) {
2023
+ sched->graph.size = graph_size;
2024
+ sched->graph.nodes = (wsp_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct wsp_ggml_tensor *));
2025
+ sched->graph.leafs = (wsp_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct wsp_ggml_tensor *));
2026
+ WSP_GGML_ASSERT(sched->graph.nodes != NULL);
2027
+ WSP_GGML_ASSERT(sched->graph.leafs != NULL);
2028
+ }
2029
+ sched->graph.n_nodes = 0;
2030
+ sched->graph.n_leafs = 0;
2031
+
2032
+ struct wsp_ggml_cgraph * graph_copy = &sched->graph;
2033
+
2034
+ for (int i = 0; i < sched->n_splits; i++) {
2035
+ struct wsp_ggml_backend_sched_split * split = &sched->splits[i];
2036
+ split->graph = wsp_ggml_graph_view(graph, split->i_start, split->i_end);
2037
+
2038
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
2039
+ for (int j = 0; j < split->n_inputs; j++) {
2040
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
2041
+
2042
+ struct wsp_ggml_tensor * input = split->inputs[j];
2043
+ const size_t input_id = hash_id(input);
2044
+ struct wsp_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
2045
+
2046
+ // add a dependency to the input source so that it is not freed before the copy is done
2047
+ struct wsp_ggml_tensor * input_dep = wsp_ggml_view_tensor(sched->ctx, input);
2048
+ input_dep->src[0] = input;
2049
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
2050
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
2051
+
2052
+ // add a dependency to the input copy so that it is allocated at the start of the split
2053
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
2054
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
2055
+ }
2056
+
2057
+ for (int j = split->i_start; j < split->i_end; j++) {
2058
+ assert(graph_copy->size > graph_copy->n_nodes);
2059
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
2060
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
2061
+ }
2062
+ }
2063
+
2064
+ if (sched->n_copies > 1) {
2065
+ // add input copies as leafs so that they are allocated first
2066
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
2067
+ struct wsp_ggml_tensor * input = sched->graph_inputs[i];
2068
+ size_t id = hash_id(input);
2069
+ int backend_id = tensor_backend_id(input);
2070
+ for (int c = 0; c < sched->n_copies; c++) {
2071
+ struct wsp_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2072
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2073
+ assert(graph_copy->size > graph_copy->n_leafs);
2074
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2075
+ }
2076
+ }
2077
+
2078
+ for (int i = 0; i < sched->n_splits; i++) {
2079
+ struct wsp_ggml_backend_sched_split * split = &sched->splits[i];
2080
+ int backend_id = split->backend_id;
2081
+ for (int j = 0; j < split->n_inputs; j++) {
2082
+ struct wsp_ggml_tensor * input = split->inputs[j];
2083
+ size_t id = hash_id(input);
2084
+ for (int c = 0; c < sched->n_copies; c++) {
2085
+ struct wsp_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2086
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2087
+ assert(graph_copy->size > graph_copy->n_leafs);
2088
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2089
+ }
2090
+ }
2091
+ }
2092
+ }
2093
+
2094
+ // add leafs from the original graph
2095
+ for (int i = 0; i < graph->n_leafs; i++) {
2096
+ struct wsp_ggml_tensor * leaf = graph->leafs[i];
2097
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
2098
+ assert(graph_copy->size > graph_copy->n_leafs);
2099
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
2100
+ }
2101
+ }
2102
+
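+ // allocates the split graph with the graph allocator; if the backend assignments changed since
+ // the last allocation (with different buffer types) or the allocation fails, the buffers are
+ // reserved again before retrying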
2103
+ static bool wsp_ggml_backend_sched_alloc_splits(wsp_ggml_backend_sched_t sched) {
2104
+ bool backend_ids_changed = false;
2105
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
2106
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
2107
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
2108
+ backend_ids_changed = true;
2109
+ break;
2110
+ }
2111
+ }
2112
+ if (!backend_ids_changed) {
2113
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
2114
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
2115
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
2116
+ backend_ids_changed = true;
2117
+ break;
2118
+ }
2119
+ }
2120
+ }
2121
+
2122
+ // allocate graph
2123
+ if (backend_ids_changed || !wsp_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2124
+ // the re-allocation may cause the split inputs to be moved to a different address
2125
+ wsp_ggml_backend_sched_synchronize(sched);
2126
+ #ifndef NDEBUG
2127
+ WSP_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
2128
+ #endif
2129
+ wsp_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
2130
+ if (!wsp_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2131
+ WSP_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
2132
+ return false;
2133
+ }
2134
+ }
2135
+
2136
+ return true;
2137
+ }
2138
+
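+ // computes the splits in order: for each split, the inputs are copied to the split's backend
+ // (asynchronously when possible) and the subgraph is computed, optionally invoking the user
+ // eval callback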
2139
+ static enum wsp_ggml_status wsp_ggml_backend_sched_compute_splits(wsp_ggml_backend_sched_t sched) {
2140
+ struct wsp_ggml_backend_sched_split * splits = sched->splits;
2141
+
2142
+ for (int i = 0; i < sched->n_splits; i++) {
2143
+ struct wsp_ggml_backend_sched_split * split = &splits[i];
2144
+ int split_backend_id = split->backend_id;
2145
+ wsp_ggml_backend_t split_backend = sched->backends[split_backend_id];
2146
+
2147
+ // copy the input tensors to the split backend
2148
+ for (int j = 0; j < split->n_inputs; j++) {
2149
+ wsp_ggml_backend_t input_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
2150
+ struct wsp_ggml_tensor * input = split->inputs[j];
2151
+ struct wsp_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
2152
+
2153
+ if (input->flags & WSP_GGML_TENSOR_FLAG_INPUT) {
2154
+ // inputs from the user must be copied immediately to prevent the user from overwriting the data before the copy is done
2155
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2156
+ wsp_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2157
+ } else {
2158
+ wsp_ggml_backend_synchronize(split_backend);
2159
+ }
2160
+ wsp_ggml_backend_tensor_copy(input, input_cpy);
2161
+ } else {
2162
+ // wait for the split backend to finish using the input before overwriting it
2163
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2164
+ wsp_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
2165
+ } else {
2166
+ wsp_ggml_backend_synchronize(split_backend);
2167
+ }
2168
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
2169
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
2170
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
2171
+ wsp_ggml_backend_synchronize(input_backend);
2172
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2173
+ wsp_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2174
+ } else {
2175
+ wsp_ggml_backend_synchronize(split_backend);
2176
+ }
2177
+ wsp_ggml_backend_tensor_copy(input, input_cpy);
2178
+ }
2179
+ }
2180
+ }
2181
+
2182
+ if (!sched->callback_eval) {
2183
+ enum wsp_ggml_status ec = wsp_ggml_backend_graph_compute_async(split_backend, &split->graph);
2184
+ if (ec != WSP_GGML_STATUS_SUCCESS) {
2185
+ return ec;
2186
+ }
2187
+ } else {
2188
+ // similar to wsp_ggml_backend_compare_graph_backend
2189
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
2190
+ struct wsp_ggml_tensor * t = split->graph.nodes[j0];
2191
+
2192
+ // check if the user needs data from this node
2193
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2194
+
2195
+ int j1 = j0;
2196
+
2197
+ // determine the range [j0, j1] of nodes that can be computed together
2198
+ while (!need && j1 < split->graph.n_nodes - 1) {
2199
+ t = split->graph.nodes[++j1];
2200
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2201
+ }
2202
+
2203
+ struct wsp_ggml_cgraph gv = wsp_ggml_graph_view(&split->graph, j0, j1 + 1);
2204
+
2205
+ enum wsp_ggml_status ec = wsp_ggml_backend_graph_compute_async(split_backend, &gv);
2206
+ if (ec != WSP_GGML_STATUS_SUCCESS) {
2207
+ return ec;
2208
+ }
2209
+
2210
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
2211
+ wsp_ggml_backend_synchronize(split_backend);
2212
+
2213
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
2214
+ break;
2215
+ }
2216
+
2217
+ j0 = j1;
2218
+ }
2219
+ }
2220
+
2221
+ // record the event of this copy
2222
+ if (split->n_inputs > 0) {
2223
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2224
+ wsp_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
2225
+ }
2226
+ }
2227
+ }
2228
+
2229
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
2230
+
2231
+ return WSP_GGML_STATUS_SUCCESS;
2232
+ }
2233
+
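+ // creates a scheduler for the given backends; the last backend must be the CPU backend, and
+ // when parallel is true multiple copies of the split inputs are kept so that the backends can
+ // be used in a pipeline-parallel fashion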
2234
+ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(
2235
+ wsp_ggml_backend_t * backends,
2236
+ wsp_ggml_backend_buffer_type_t * bufts,
2237
+ int n_backends,
2238
+ size_t graph_size,
2239
+ bool parallel) {
2240
+ WSP_GGML_ASSERT(n_backends > 0);
2241
+ WSP_GGML_ASSERT(n_backends <= WSP_GGML_SCHED_MAX_BACKENDS);
2242
+ WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
2243
+
2244
+ struct wsp_ggml_backend_sched * sched = (wsp_ggml_backend_sched *) calloc(1, sizeof(struct wsp_ggml_backend_sched));
2245
+
2246
+ sched->debug = getenv("WSP_GGML_SCHED_DEBUG") != NULL;
2247
+ sched->n_backends = n_backends;
2248
+ sched->n_copies = parallel ? WSP_GGML_SCHED_MAX_COPIES : 1;
2249
+
2250
+ // initialize hash table
2251
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
2252
+ sched->hash_set = wsp_ggml_hash_set_new(graph_size);
2253
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2254
+ sched->hv_tensor_copies = (wsp_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct wsp_ggml_tensor *));
2255
+
2256
+ const size_t wsp_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
2257
+ const size_t nodes_size = graph_size + wsp_ggml_sched_max_splits*WSP_GGML_SCHED_MAX_SPLIT_INPUTS*2;
2258
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
2259
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
2260
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
2261
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
2262
+
2263
+ sched->context_buffer_size = wsp_ggml_sched_max_splits*WSP_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct wsp_ggml_tensor) + wsp_ggml_graph_overhead_custom(graph_size, false);
2264
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
2265
+
2266
+ const int initial_splits_capacity = 16;
2267
+ sched->splits = (wsp_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
2268
+ sched->splits_capacity = initial_splits_capacity;
2269
+
2270
+ for (int b = 0; b < n_backends; b++) {
2271
+ sched->backends[b] = backends[b];
2272
+ sched->bufts[b] = bufts ? bufts[b] : wsp_ggml_backend_get_default_buffer_type(backends[b]);
2273
+ WSP_GGML_ASSERT(wsp_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
2274
+
2275
+ if (sched->n_copies > 1) {
2276
+ for (int c = 0; c < sched->n_copies; c++) {
2277
+ sched->events[b][c] = wsp_ggml_backend_event_new(backends[b]->device);
2278
+ }
2279
+ }
2280
+ }
2281
+
2282
+ sched->galloc = wsp_ggml_gallocr_new_n(sched->bufts, n_backends);
2283
+
2284
+ wsp_ggml_backend_sched_reset(sched);
2285
+
2286
+ return sched;
2287
+ }
2288
+
2289
+ void wsp_ggml_backend_sched_free(wsp_ggml_backend_sched_t sched) {
2290
+ if (sched == NULL) {
2291
+ return;
2292
+ }
2293
+ for (int b = 0; b < sched->n_backends; b++) {
2294
+ for (int c = 0; c < sched->n_copies; c++) {
2295
+ wsp_ggml_backend_event_free(sched->events[b][c]);
2296
+ }
2297
+ }
2298
+ wsp_ggml_gallocr_free(sched->galloc);
2299
+ wsp_ggml_free(sched->ctx);
2300
+ wsp_ggml_hash_set_free(&sched->hash_set);
2301
+ free(sched->splits);
2302
+ free(sched->hv_tensor_backend_ids);
2303
+ free(sched->hv_tensor_copies);
2304
+ free(sched->node_backend_ids);
2305
+ free(sched->leaf_backend_ids);
2306
+ free(sched->prev_node_backend_ids);
2307
+ free(sched->prev_leaf_backend_ids);
2308
+ free(sched->context_buffer);
2309
+ free(sched->graph.nodes);
2310
+ free(sched->graph.leafs);
2311
+ free(sched);
2312
+ }
2313
+
2314
+ void wsp_ggml_backend_sched_reset(wsp_ggml_backend_sched_t sched) {
2315
+ // reset state for the next run
2316
+ if (!sched->is_reset) {
2317
+ wsp_ggml_hash_set_reset(&sched->hash_set);
2318
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2319
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct wsp_ggml_tensor *));
2320
+ sched->is_reset = true;
2321
+ }
2322
+ sched->is_alloc = false;
2323
+ }
2324
+
2325
+ bool wsp_ggml_backend_sched_reserve(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph) {
2326
+ WSP_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
2327
+
2328
+ wsp_ggml_backend_sched_split_graph(sched, measure_graph);
2329
+
2330
+ if (!wsp_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
2331
+ return false;
2332
+ }
2333
+
2334
+ wsp_ggml_backend_sched_reset(sched);
2335
+ wsp_ggml_backend_sched_synchronize(sched);
2336
+
2337
+ return true;
2338
+ }
2339
+
2340
+ bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
2341
+ WSP_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
2342
+
2343
+ wsp_ggml_backend_sched_split_graph(sched, graph);
2344
+
2345
+
2346
+ if (!wsp_ggml_backend_sched_alloc_splits(sched)) {
2347
+ return false;
2348
+ }
2349
+
2350
+ sched->is_alloc = true;
2351
+
2352
+ return true;
2353
+ }
2354
+
2355
+ enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
2356
+ enum wsp_ggml_status err = wsp_ggml_backend_sched_graph_compute_async(sched, graph);
2357
+ wsp_ggml_backend_sched_synchronize(sched);
2358
+ return err;
2359
+ }
2360
+
2361
+ enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute_async(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
2362
+ if (!sched->is_reset && !sched->is_alloc) {
2363
+ wsp_ggml_backend_sched_reset(sched);
2364
+ }
2365
+
2366
+ if (!sched->is_alloc) {
2367
+ if (!wsp_ggml_backend_sched_alloc_graph(sched, graph)) {
2368
+ return WSP_GGML_STATUS_ALLOC_FAILED;
2369
+ }
2370
+ }
2371
+
2372
+ return wsp_ggml_backend_sched_compute_splits(sched);
2373
+ }
2374
+
2375
+ void wsp_ggml_backend_sched_synchronize(wsp_ggml_backend_sched_t sched) {
2376
+ for (int i = 0; i < sched->n_backends; i++) {
2377
+ wsp_ggml_backend_synchronize(sched->backends[i]);
2378
+ }
2379
+ }
2380
+
2381
+ void wsp_ggml_backend_sched_set_eval_callback(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_sched_eval_callback callback, void * user_data) {
2382
+ sched->callback_eval = callback;
2383
+ sched->callback_eval_user_data = user_data;
2384
+ }
2385
+
2386
+ int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched) {
2387
+ return sched->n_splits;
2388
+ }
2389
+
2390
+ int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched) {
2391
+ return sched->n_copies;
2392
+ }
2393
+
2394
+ int wsp_ggml_backend_sched_get_n_backends(wsp_ggml_backend_sched_t sched) {
2395
+ return sched->n_backends;
2396
+ }
2397
+
2398
+ wsp_ggml_backend_t wsp_ggml_backend_sched_get_backend(wsp_ggml_backend_sched_t sched, int i) {
2399
+ WSP_GGML_ASSERT(i >= 0 && i < sched->n_backends);
2400
+ return sched->backends[i];
2401
+ }
2402
+
2403
+ size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
2404
+ int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
2405
+ WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2406
+
2407
+ return wsp_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2408
+ }
2409
+
2410
+ void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend) {
2411
+ int backend_index = wsp_ggml_backend_sched_backend_id(sched, backend);
2412
+ WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2413
+ tensor_backend_id(node) = backend_index;
2414
+ SET_CAUSE(node, "usr");
2415
+ sched->is_reset = false;
2416
+ }
2417
+
2418
+ wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
2419
+ int backend_index = tensor_backend_id(node);
2420
+ if (backend_index == -1) {
2421
+ return NULL;
2422
+ }
2423
+ return sched->backends[backend_index];
2424
+ }
2425
+
2426
+ // utils
2427
+
2428
+ void wsp_ggml_backend_view_init(struct wsp_ggml_tensor * tensor) {
2429
+ WSP_GGML_ASSERT(tensor->buffer == NULL);
2430
+ WSP_GGML_ASSERT(tensor->view_src != NULL);
2431
+ WSP_GGML_ASSERT(tensor->view_src->buffer != NULL);
2432
+ WSP_GGML_ASSERT(tensor->view_src->data != NULL);
2433
+
2434
+ tensor->buffer = tensor->view_src->buffer;
2435
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2436
+ wsp_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2437
+ }
2438
+
2439
+ void wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr) {
2440
+ WSP_GGML_ASSERT(tensor->buffer == NULL);
2441
+ WSP_GGML_ASSERT(tensor->data == NULL);
2442
+ WSP_GGML_ASSERT(tensor->view_src == NULL);
2443
+ WSP_GGML_ASSERT(addr >= wsp_ggml_backend_buffer_get_base(buffer));
2444
+ WSP_GGML_ASSERT((char *)addr + wsp_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2445
+ (char *)wsp_ggml_backend_buffer_get_base(buffer) + wsp_ggml_backend_buffer_get_size(buffer));
2446
+
2447
+ tensor->buffer = buffer;
2448
+ tensor->data = addr;
2449
+ wsp_ggml_backend_buffer_init_tensor(buffer, tensor);
2450
+ }
2451
+
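+ // recursively duplicates a tensor and its sources into ctx_allocated / ctx_unallocated,
+ // using the hash set to deduplicate tensors that are reachable through multiple paths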
2452
+ static struct wsp_ggml_tensor * graph_copy_dup_tensor(struct wsp_ggml_hash_set hash_set, struct wsp_ggml_tensor ** node_copies,
2453
+ struct wsp_ggml_context * ctx_allocated, struct wsp_ggml_context * ctx_unallocated, struct wsp_ggml_tensor * src) {
2454
+
2455
+ WSP_GGML_ASSERT(src != NULL);
2456
+ WSP_GGML_ASSERT(src->data && "graph must be allocated");
2457
+
2458
+ size_t id = wsp_ggml_hash_insert(&hash_set, src);
2459
+ if (id == WSP_GGML_HASHSET_ALREADY_EXISTS) {
2460
+ return node_copies[wsp_ggml_hash_find(&hash_set, src)];
2461
+ }
2462
+
2463
+ struct wsp_ggml_tensor * dst = wsp_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2464
+ if (src->view_src != NULL) {
2465
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2466
+ dst->view_offs = src->view_offs;
2467
+ }
2468
+ dst->op = src->op;
2469
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2470
+ wsp_ggml_set_name(dst, src->name);
2471
+
2472
+ // copy src
2473
+ for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
2474
+ struct wsp_ggml_tensor * s = src->src[i];
2475
+ if (s == NULL) {
2476
+ continue;
2477
+ }
2478
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2479
+ }
2480
+
2481
+ node_copies[id] = dst;
2482
+ return dst;
2483
+ }
2484
+
2485
+ static void graph_copy_init_tensor(struct wsp_ggml_hash_set * hash_set, struct wsp_ggml_tensor ** node_copies, bool * node_init, struct wsp_ggml_tensor * src) {
2486
+ size_t id = wsp_ggml_hash_find(hash_set, src);
2487
+ if (node_init[id]) {
2488
+ return;
2489
+ }
2490
+ node_init[id] = true;
2491
+
2492
+ struct wsp_ggml_tensor * dst = node_copies[id];
2493
+ if (dst->view_src != NULL) {
2494
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2495
+ wsp_ggml_backend_view_init(dst);
2496
+ }
2497
+ else {
2498
+ wsp_ggml_backend_tensor_copy(src, dst);
2499
+ }
2500
+
2501
+ // init src
2502
+ for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
2503
+ struct wsp_ggml_tensor * s = src->src[i];
2504
+ if (s == NULL) {
2505
+ continue;
2506
+ }
2507
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2508
+ }
2509
+ }
2510
+
2511
+ struct wsp_ggml_backend_graph_copy wsp_ggml_backend_graph_copy(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * graph) {
2512
+ struct wsp_ggml_hash_set hash_set = wsp_ggml_hash_set_new(graph->visited_hash_set.size);
2513
+ struct wsp_ggml_tensor ** node_copies = (wsp_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2514
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
2515
+
2516
+ struct wsp_ggml_init_params params = {
2517
+ /* .mem_size = */ wsp_ggml_tensor_overhead()*hash_set.size + wsp_ggml_graph_overhead_custom(graph->size, false),
2518
+ /* .mem_buffer = */ NULL,
2519
+ /* .no_alloc = */ true
2520
+ };
2521
+
2522
+ struct wsp_ggml_context * ctx_allocated = wsp_ggml_init(params);
2523
+ struct wsp_ggml_context * ctx_unallocated = wsp_ggml_init(params);
2524
+
2525
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2526
+ WSP_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
2527
+ wsp_ggml_hash_set_free(&hash_set);
2528
+ free(node_copies);
2529
+ free(node_init);
2530
+ wsp_ggml_free(ctx_allocated);
2531
+ wsp_ggml_free(ctx_unallocated);
2532
+ return {
2533
+ /* .buffer = */ NULL,
2534
+ /* .ctx_allocated = */ NULL,
2535
+ /* .ctx_unallocated = */ NULL,
2536
+ /* .graph = */ NULL,
2537
+ };
2538
+ }
2539
+
2540
+ // dup nodes
2541
+ for (int i = 0; i < graph->n_nodes; i++) {
2542
+ struct wsp_ggml_tensor * node = graph->nodes[i];
2543
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2544
+ }
2545
+
2546
+ // allocate nodes
2547
+ wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2548
+ if (buffer == NULL) {
2549
+ WSP_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
2550
+ wsp_ggml_hash_set_free(&hash_set);
2551
+ free(node_copies);
2552
+ free(node_init);
2553
+ wsp_ggml_free(ctx_allocated);
2554
+ wsp_ggml_free(ctx_unallocated);
2555
+ return {
2556
+ /* .buffer = */ NULL,
2557
+ /* .ctx_allocated = */ NULL,
2558
+ /* .ctx_unallocated = */ NULL,
2559
+ /* .graph = */ NULL,
2560
+ };
2561
+ }
2562
+
2563
+ //printf("copy buffer size: %zu MB\n", wsp_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2564
+
2565
+ // copy data and init views
2566
+ for (int i = 0; i < graph->n_nodes; i++) {
2567
+ struct wsp_ggml_tensor * node = graph->nodes[i];
2568
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2569
+ }
2570
+
2571
+ // build graph copy
2572
+ struct wsp_ggml_cgraph * graph_copy = wsp_ggml_new_graph_custom(ctx_allocated, graph->size, false);
2573
+ for (int i = 0; i < graph->n_nodes; i++) {
2574
+ struct wsp_ggml_tensor * node = graph->nodes[i];
2575
+ struct wsp_ggml_tensor * node_copy = node_copies[wsp_ggml_hash_find(&hash_set, node)];
2576
+ graph_copy->nodes[i] = node_copy;
2577
+ }
2578
+ graph_copy->n_nodes = graph->n_nodes;
2579
+
2580
+ wsp_ggml_hash_set_free(&hash_set);
2581
+ free(node_copies);
2582
+ free(node_init);
2583
+
2584
+ return {
2585
+ /* .buffer = */ buffer,
2586
+ /* .ctx_allocated = */ ctx_allocated,
2587
+ /* .ctx_unallocated = */ ctx_unallocated,
2588
+ /* .graph = */ graph_copy,
2589
+ };
2590
+ }
2591
+
2592
+ void wsp_ggml_backend_graph_copy_free(struct wsp_ggml_backend_graph_copy copy) {
2593
+ wsp_ggml_backend_buffer_free(copy.buffer);
2594
+ wsp_ggml_free(copy.ctx_allocated);
2595
+ wsp_ggml_free(copy.ctx_unallocated);
2596
+ }
2597
+
2598
+ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data) {
2599
+ struct wsp_ggml_backend_graph_copy copy = wsp_ggml_backend_graph_copy(backend2, graph);
2600
+ if (copy.buffer == NULL) {
2601
+ return false;
2602
+ }
2603
+
2604
+ struct wsp_ggml_cgraph * g1 = graph;
2605
+ struct wsp_ggml_cgraph * g2 = copy.graph;
2606
+
2607
+ assert(g1->n_nodes == g2->n_nodes);
2608
+
2609
+ for (int i = 0; i < g1->n_nodes; i++) {
2610
+ //printf("eval %d/%d\n", i, g1->n_nodes);
2611
+ struct wsp_ggml_tensor * t1 = g1->nodes[i];
2612
+ struct wsp_ggml_tensor * t2 = g2->nodes[i];
2613
+
2614
+ assert(t1->op == t2->op && wsp_ggml_are_same_layout(t1, t2));
2615
+
2616
+ struct wsp_ggml_cgraph g1v = wsp_ggml_graph_view(g1, i, i + 1);
2617
+ struct wsp_ggml_cgraph g2v = wsp_ggml_graph_view(g2, i, i + 1);
2618
+
2619
+ wsp_ggml_backend_graph_compute(backend1, &g1v);
2620
+ wsp_ggml_backend_graph_compute(backend2, &g2v);
2621
+
2622
+ if (wsp_ggml_is_view_op(t1->op)) {
2623
+ continue;
2624
+ }
2625
+
2626
+ // compare results, calculate rms, etc.
2627
+ if (!callback(i, t1, t2, user_data)) {
2628
+ break;
2629
+ }
2630
+ }
2631
+
2632
+ wsp_ggml_backend_graph_copy_free(copy);
2633
+
2634
+ return true;
2635
+ }