whisper.rn 0.4.0-rc.3 → 0.4.0-rc.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/android/src/main/CMakeLists.txt +2 -0
  2. package/android/src/main/java/com/rnwhisper/RNWhisper.java +6 -1
  3. package/android/src/main/java/com/rnwhisper/WhisperContext.java +3 -3
  4. package/android/src/main/jni.cpp +6 -2
  5. package/cpp/ggml-alloc.c +413 -280
  6. package/cpp/ggml-alloc.h +67 -8
  7. package/cpp/ggml-backend-impl.h +87 -0
  8. package/cpp/ggml-backend.c +950 -0
  9. package/cpp/ggml-backend.h +136 -0
  10. package/cpp/ggml-impl.h +243 -0
  11. package/cpp/{ggml-metal.metal → ggml-metal-whisper.metal} +591 -121
  12. package/cpp/ggml-metal.h +21 -0
  13. package/cpp/ggml-metal.m +623 -234
  14. package/cpp/ggml-quants.c +7377 -0
  15. package/cpp/ggml-quants.h +224 -0
  16. package/cpp/ggml.c +3773 -4455
  17. package/cpp/ggml.h +279 -146
  18. package/cpp/whisper.cpp +182 -103
  19. package/cpp/whisper.h +48 -11
  20. package/ios/RNWhisper.mm +8 -2
  21. package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +4 -0
  22. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  23. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  24. package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +19 -0
  25. package/ios/RNWhisperContext.h +5 -1
  26. package/ios/RNWhisperContext.mm +76 -10
  27. package/jest/mock.js +1 -1
  28. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  29. package/lib/commonjs/index.js +28 -9
  30. package/lib/commonjs/index.js.map +1 -1
  31. package/lib/commonjs/version.json +1 -1
  32. package/lib/module/NativeRNWhisper.js.map +1 -1
  33. package/lib/module/index.js +28 -9
  34. package/lib/module/index.js.map +1 -1
  35. package/lib/module/version.json +1 -1
  36. package/lib/typescript/NativeRNWhisper.d.ts +7 -1
  37. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  38. package/lib/typescript/index.d.ts +7 -2
  39. package/lib/typescript/index.d.ts.map +1 -1
  40. package/package.json +1 -1
  41. package/src/NativeRNWhisper.ts +8 -1
  42. package/src/index.ts +29 -17
  43. package/src/version.json +1 -1
  44. package/whisper-rn.podspec +1 -2
@@ -0,0 +1,950 @@
1
+ #include "ggml-backend-impl.h"
2
+ #include "ggml-alloc.h"
3
+ #include "ggml-impl.h"
4
+
5
+ #include <assert.h>
6
+ #include <limits.h>
7
+ #include <stdarg.h>
8
+ #include <stdio.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+
12
+ #define UNUSED WSP_GGML_UNUSED
13
+
14
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
15
+
16
+ // backend buffer
17
+
18
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_buffer_init(
19
+ struct wsp_ggml_backend * backend,
20
+ struct wsp_ggml_backend_buffer_i iface,
21
+ wsp_ggml_backend_buffer_context_t context,
22
+ size_t size) {
23
+ wsp_ggml_backend_buffer_t buffer = malloc(sizeof(struct wsp_ggml_backend_buffer));
24
+
25
+ WSP_GGML_ASSERT(iface.get_base != NULL);
26
+
27
+ (*buffer) = (struct wsp_ggml_backend_buffer) {
28
+ /* .interface = */ iface,
29
+ /* .backend = */ backend,
30
+ /* .context = */ context,
31
+ /* .size = */ size,
32
+ };
33
+
34
+ return buffer;
35
+ }
36
+
37
+ void wsp_ggml_backend_buffer_free(wsp_ggml_backend_buffer_t buffer) {
38
+ if (buffer == NULL) {
39
+ return;
40
+ }
41
+
42
+ if (buffer->iface.free_buffer != NULL) {
43
+ buffer->iface.free_buffer(buffer);
44
+ }
45
+ free(buffer);
46
+ }
47
+
48
+ size_t wsp_ggml_backend_buffer_get_alignment(wsp_ggml_backend_buffer_t buffer) {
49
+ return wsp_ggml_backend_get_alignment(buffer->backend);
50
+ }
51
+
52
+ size_t wsp_ggml_backend_buffer_get_size(wsp_ggml_backend_buffer_t buffer) {
53
+ return buffer->size;
54
+ }
55
+
56
+ void * wsp_ggml_backend_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
57
+ void * base = buffer->iface.get_base(buffer);
58
+
59
+ WSP_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
60
+
61
+ return base;
62
+ }
63
+
64
+ size_t wsp_ggml_backend_buffer_get_alloc_size(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
65
+ // get_alloc_size is optional, defaults to wsp_ggml_nbytes
66
+ if (buffer->iface.get_alloc_size) {
67
+ return buffer->iface.get_alloc_size(buffer, tensor);
68
+ }
69
+ return wsp_ggml_nbytes(tensor);
70
+ }
71
+
72
+ void wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
73
+ // init_tensor is optional
74
+ if (buffer->iface.init_tensor) {
75
+ buffer->iface.init_tensor(buffer, tensor);
76
+ }
77
+ }
78
+
79
+ void wsp_ggml_backend_buffer_free_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
80
+ // free_tensor is optional
81
+ if (buffer->iface.free_tensor) {
82
+ buffer->iface.free_tensor(buffer, tensor);
83
+ }
84
+ }
85
+
86
+ // backend
87
+
88
+ wsp_ggml_backend_t wsp_ggml_get_backend(const struct wsp_ggml_tensor * tensor) {
89
+ return tensor->buffer ? tensor->buffer->backend : NULL;
90
+ }
91
+
92
+ const char * wsp_ggml_backend_name(wsp_ggml_backend_t backend) {
93
+ if (backend == NULL) {
94
+ return "NULL";
95
+ }
96
+ return backend->iface.get_name(backend);
97
+ }
98
+
99
+ void wsp_ggml_backend_free(wsp_ggml_backend_t backend) {
100
+ if (backend == NULL) {
101
+ return;
102
+ }
103
+
104
+ backend->iface.free(backend);
105
+ }
106
+
107
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_alloc_buffer(wsp_ggml_backend_t backend, size_t size) {
108
+ return backend->iface.alloc_buffer(backend, size);
109
+ }
110
+
111
+ size_t wsp_ggml_backend_get_alignment(wsp_ggml_backend_t backend) {
112
+ return backend->iface.get_alignment(backend);
113
+ }
114
+
115
+ void wsp_ggml_backend_tensor_set_async(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
116
+ wsp_ggml_get_backend(tensor)->iface.set_tensor_async(wsp_ggml_get_backend(tensor), tensor, data, offset, size);
117
+ }
118
+
119
+ void wsp_ggml_backend_tensor_get_async(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
120
+ wsp_ggml_get_backend(tensor)->iface.get_tensor_async(wsp_ggml_get_backend(tensor), tensor, data, offset, size);
121
+ }
122
+
123
+ void wsp_ggml_backend_tensor_set(struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
124
+ wsp_ggml_backend_t backend = wsp_ggml_get_backend(tensor);
125
+
126
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
127
+ WSP_GGML_ASSERT(backend != NULL && "tensor backend not set");
128
+
129
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
130
+ backend->iface.synchronize(backend);
131
+ }
132
+
133
+ void wsp_ggml_backend_tensor_get(const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
134
+ wsp_ggml_backend_t backend = wsp_ggml_get_backend(tensor);
135
+
136
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
137
+ WSP_GGML_ASSERT(backend != NULL && "tensor backend not set");
138
+
139
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
140
+ backend->iface.synchronize(backend);
141
+ }
142
+
143
+ void wsp_ggml_backend_synchronize(wsp_ggml_backend_t backend) {
144
+ backend->iface.synchronize(backend);
145
+ }
146
+
147
+ wsp_ggml_backend_graph_plan_t wsp_ggml_backend_graph_plan_create(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
148
+ return backend->iface.graph_plan_create(backend, cgraph);
149
+ }
150
+
151
+ void wsp_ggml_backend_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
152
+ backend->iface.graph_plan_free(backend, plan);
153
+ }
154
+
155
+ void wsp_ggml_backend_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
156
+ backend->iface.graph_plan_compute(backend, plan);
157
+ }
158
+
159
+ void wsp_ggml_backend_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
160
+ backend->iface.graph_compute(backend, cgraph);
161
+ }
162
+
163
+ bool wsp_ggml_backend_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
164
+ return backend->iface.supports_op(backend, op);
165
+ }
166
+
167
+ // backend copy
168
+
169
+ static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
170
+ if (a->type != b->type) {
171
+ return false;
172
+ }
173
+ for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
174
+ if (a->ne[i] != b->ne[i]) {
175
+ return false;
176
+ }
177
+ if (a->nb[i] != b->nb[i]) {
178
+ return false;
179
+ }
180
+ }
181
+ return true;
182
+ }
183
+
184
+ void wsp_ggml_backend_tensor_copy(struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
185
+ //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
186
+ //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
187
+ WSP_GGML_ASSERT(wsp_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
188
+
189
+ // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, wsp_ggml_backend_name(src->backend), wsp_ggml_backend_name(dst->backend), wsp_ggml_nbytes(src));
190
+
191
+ if (src == dst) {
192
+ return;
193
+ }
194
+
195
+ // TODO: allow backends to support copy to/from same backend
196
+
197
+ if (wsp_ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
198
+ wsp_ggml_get_backend(dst)->iface.cpy_tensor_from(wsp_ggml_get_backend(dst)->context, src, dst);
199
+ } else if (wsp_ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
200
+ wsp_ggml_get_backend(src)->iface.cpy_tensor_to(wsp_ggml_get_backend(src)->context, src, dst);
201
+ } else {
202
+ // shouldn't be hit when copying from/to CPU
203
+ #ifndef NDEBUG
204
+ fprintf(stderr, "wsp_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", wsp_ggml_backend_name(src->buffer->backend), wsp_ggml_backend_name(dst->buffer->backend));
205
+ #endif
206
+ size_t nbytes = wsp_ggml_nbytes(src);
207
+ void * data = malloc(nbytes);
208
+ wsp_ggml_backend_tensor_get(src, data, 0, nbytes);
209
+ wsp_ggml_backend_tensor_set(dst, data, 0, nbytes);
210
+ free(data);
211
+ }
212
+ }
213
+
214
+ // backend CPU
215
+
216
// Per-instance state of the CPU backend.
struct wsp_ggml_backend_cpu_context {
    int    n_threads;   // thread count handed to wsp_ggml_graph_plan
    void * work_data;   // scratch buffer reused across graph_compute calls (NULL until first needed)
    size_t work_size;   // current capacity of work_data in bytes
};
221
+
222
+ static const char * wsp_ggml_backend_cpu_name(wsp_ggml_backend_t backend) {
223
+ return "CPU";
224
+
225
+ UNUSED(backend);
226
+ }
227
+
228
+ static void wsp_ggml_backend_cpu_free(wsp_ggml_backend_t backend) {
229
+ struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
230
+ free(cpu_ctx->work_data);
231
+ free(cpu_ctx);
232
+ free(backend);
233
+ }
234
+
235
+ static void * wsp_ggml_backend_cpu_buffer_get_base(wsp_ggml_backend_buffer_t buffer) {
236
+ return (void *)buffer->context;
237
+ }
238
+
239
+ static void wsp_ggml_backend_cpu_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
240
+ free(buffer->context);
241
+ UNUSED(buffer);
242
+ }
243
+
244
+ static struct wsp_ggml_backend_buffer_i cpu_backend_buffer_i = {
245
+ /* .free_buffer = */ wsp_ggml_backend_cpu_buffer_free_buffer,
246
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
247
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
248
+ /* .init_tensor = */ NULL, // no initialization required
249
+ /* .free_tensor = */ NULL, // no cleanup required
250
+ };
251
+
252
+ // for buffers from ptr, free is not called
253
+ static struct wsp_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
254
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
255
+ /* .get_base = */ wsp_ggml_backend_cpu_buffer_get_base,
256
+ /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
257
+ /* .init_tensor = */ NULL,
258
+ /* .free_tensor = */ NULL,
259
+ };
260
+
261
// Alignment reported by the CPU backend and used to pad CPU buffer allocations.
// 64 bytes should be enough for AVX 512.
static const size_t TENSOR_ALIGNMENT = 64;
262
+
263
+ static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_alloc_buffer(wsp_ggml_backend_t backend, size_t size) {
264
+ size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
265
+ void * data = malloc(size); // TODO: maybe use WSP_GGML_ALIGNED_MALLOC?
266
+
267
+ WSP_GGML_ASSERT(data != NULL && "failed to allocate buffer");
268
+
269
+ return wsp_ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
270
+ }
271
+
272
+ static size_t wsp_ggml_backend_cpu_get_alignment(wsp_ggml_backend_t backend) {
273
+ return TENSOR_ALIGNMENT;
274
+ UNUSED(backend);
275
+ }
276
+
277
+ static void wsp_ggml_backend_cpu_set_tensor_async(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
278
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor write out of bounds");
279
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
280
+
281
+ memcpy((char *)tensor->data + offset, data, size);
282
+
283
+ UNUSED(backend);
284
+ }
285
+
286
+ static void wsp_ggml_backend_cpu_get_tensor_async(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
287
+ WSP_GGML_ASSERT(offset + size <= wsp_ggml_nbytes(tensor) && "tensor read out of bounds");
288
+ WSP_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
289
+
290
+ memcpy(data, (const char *)tensor->data + offset, size);
291
+
292
+ UNUSED(backend);
293
+ }
294
+
295
+ static void wsp_ggml_backend_cpu_synchronize(wsp_ggml_backend_t backend) {
296
+ UNUSED(backend);
297
+ }
298
+
299
+ static void wsp_ggml_backend_cpu_cpy_tensor_from(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
300
+ wsp_ggml_backend_tensor_get(src, dst->data, 0, wsp_ggml_nbytes(src));
301
+
302
+ UNUSED(backend);
303
+ }
304
+
305
+ static void wsp_ggml_backend_cpu_cpy_tensor_to(wsp_ggml_backend_t backend, struct wsp_ggml_tensor * src, struct wsp_ggml_tensor * dst) {
306
+ wsp_ggml_backend_tensor_set(dst, src->data, 0, wsp_ggml_nbytes(src));
307
+
308
+ UNUSED(backend);
309
+ }
310
+
311
+ struct wsp_ggml_backend_plan_cpu {
312
+ struct wsp_ggml_cplan cplan;
313
+ struct wsp_ggml_cgraph cgraph;
314
+ };
315
+
316
+ static wsp_ggml_backend_graph_plan_t wsp_ggml_backend_cpu_graph_plan_create(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
317
+ struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
318
+
319
+ struct wsp_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct wsp_ggml_backend_plan_cpu));
320
+
321
+ cpu_plan->cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
322
+ cpu_plan->cgraph = *cgraph;
323
+
324
+ if (cpu_plan->cplan.work_size > 0) {
325
+ cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
326
+ }
327
+
328
+ return cpu_plan;
329
+ }
330
+
331
+ static void wsp_ggml_backend_cpu_graph_plan_free(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
332
+ struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
333
+
334
+ free(cpu_plan->cplan.work_data);
335
+ free(cpu_plan);
336
+
337
+ UNUSED(backend);
338
+ }
339
+
340
+ static void wsp_ggml_backend_cpu_graph_plan_compute(wsp_ggml_backend_t backend, wsp_ggml_backend_graph_plan_t plan) {
341
+ struct wsp_ggml_backend_plan_cpu * cpu_plan = (struct wsp_ggml_backend_plan_cpu *)plan;
342
+
343
+ wsp_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
344
+
345
+ UNUSED(backend);
346
+ }
347
+
348
+ static void wsp_ggml_backend_cpu_graph_compute(wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph) {
349
+ struct wsp_ggml_backend_cpu_context * cpu_ctx = (struct wsp_ggml_backend_cpu_context *)backend->context;
350
+
351
+ struct wsp_ggml_cplan cplan = wsp_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
352
+
353
+ if (cpu_ctx->work_size < cplan.work_size) {
354
+ // TODO: may be faster to free and use malloc to avoid the copy
355
+ cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
356
+ cpu_ctx->work_size = cplan.work_size;
357
+ }
358
+
359
+ cplan.work_data = cpu_ctx->work_data;
360
+
361
+ wsp_ggml_graph_compute(cgraph, &cplan);
362
+ }
363
+
364
+ static bool wsp_ggml_backend_cpu_supports_op(wsp_ggml_backend_t backend, const struct wsp_ggml_tensor * op) {
365
+ return true;
366
+ UNUSED(backend);
367
+ UNUSED(op);
368
+ }
369
+
370
+ static struct wsp_ggml_backend_i cpu_backend_i = {
371
+ /* .get_name = */ wsp_ggml_backend_cpu_name,
372
+ /* .free = */ wsp_ggml_backend_cpu_free,
373
+ /* .alloc_buffer = */ wsp_ggml_backend_cpu_alloc_buffer,
374
+ /* .get_alignment = */ wsp_ggml_backend_cpu_get_alignment,
375
+ /* .set_tensor_async = */ wsp_ggml_backend_cpu_set_tensor_async,
376
+ /* .get_tensor_async = */ wsp_ggml_backend_cpu_get_tensor_async,
377
+ /* .synchronize = */ wsp_ggml_backend_cpu_synchronize,
378
+ /* .cpy_tensor_from = */ wsp_ggml_backend_cpu_cpy_tensor_from,
379
+ /* .cpy_tensor_to = */ wsp_ggml_backend_cpu_cpy_tensor_to,
380
+ /* .graph_plan_create = */ wsp_ggml_backend_cpu_graph_plan_create,
381
+ /* .graph_plan_free = */ wsp_ggml_backend_cpu_graph_plan_free,
382
+ /* .graph_plan_compute = */ wsp_ggml_backend_cpu_graph_plan_compute,
383
+ /* .graph_compute = */ wsp_ggml_backend_cpu_graph_compute,
384
+ /* .supports_op = */ wsp_ggml_backend_cpu_supports_op,
385
+ };
386
+
387
+ wsp_ggml_backend_t wsp_ggml_backend_cpu_init(void) {
388
+ struct wsp_ggml_backend_cpu_context * ctx = malloc(sizeof(struct wsp_ggml_backend_cpu_context));
389
+
390
+ ctx->n_threads = WSP_GGML_DEFAULT_N_THREADS;
391
+ ctx->work_data = NULL;
392
+ ctx->work_size = 0;
393
+
394
+ wsp_ggml_backend_t cpu_backend = malloc(sizeof(struct wsp_ggml_backend));
395
+
396
+ *cpu_backend = (struct wsp_ggml_backend) {
397
+ /* .interface = */ cpu_backend_i,
398
+ /* .context = */ ctx
399
+ };
400
+ return cpu_backend;
401
+ }
402
+
403
+ bool wsp_ggml_backend_is_cpu(wsp_ggml_backend_t backend) {
404
+ return backend->iface.get_name == wsp_ggml_backend_cpu_name;
405
+ }
406
+
407
+ void wsp_ggml_backend_cpu_set_n_threads(wsp_ggml_backend_t backend_cpu, int n_threads) {
408
+ WSP_GGML_ASSERT(wsp_ggml_backend_is_cpu(backend_cpu));
409
+
410
+ struct wsp_ggml_backend_cpu_context * ctx = (struct wsp_ggml_backend_cpu_context *)backend_cpu->context;
411
+ ctx->n_threads = n_threads;
412
+ }
413
+
414
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_buffer_from_ptr(wsp_ggml_backend_t backend_cpu, void * ptr, size_t size) {
415
+ return wsp_ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
416
+ }
417
+
418
+ // scheduler
419
+
420
+ #define WSP_GGML_MAX_BACKENDS 4
421
+ #define WSP_GGML_MAX_SPLITS 256
422
+ #define WSP_GGML_MAX_SPLIT_INPUTS 16
423
+
424
+ struct wsp_ggml_backend_sched_split {
425
+ wsp_ggml_tallocr_t tallocr;
426
+ int i_start;
427
+ int i_end;
428
+ struct wsp_ggml_tensor * inputs[WSP_GGML_MAX_SPLIT_INPUTS];
429
+ int n_inputs;
430
+ struct wsp_ggml_cgraph * graph;
431
+ };
432
+
433
+ struct wsp_ggml_backend_sched {
434
+ int n_backends;
435
+ wsp_ggml_backend_t backends[WSP_GGML_MAX_BACKENDS];
436
+ wsp_ggml_tallocr_t tallocs[WSP_GGML_MAX_BACKENDS];
437
+
438
+ wsp_ggml_gallocr_t galloc;
439
+
440
+ struct wsp_ggml_hash_set hash_set;
441
+ wsp_ggml_tallocr_t * node_talloc; // [hash_set.size]
442
+ struct wsp_ggml_tensor * (* node_copies)[WSP_GGML_MAX_BACKENDS]; // [hash_set.size][WSP_GGML_MAX_BACKENDS]
443
+
444
+ struct wsp_ggml_cgraph * graph;
445
+ struct wsp_ggml_backend_sched_split splits[WSP_GGML_MAX_SPLITS];
446
+ int n_splits;
447
+
448
+ struct wsp_ggml_context * ctx;
449
+
450
+ // align context_buffer to WSP_GGML_MEM_ALIGN
451
+ #ifdef _MSC_VER
452
+ __declspec(align(WSP_GGML_MEM_ALIGN))
453
+ #else
454
+ __attribute__((aligned(WSP_GGML_MEM_ALIGN)))
455
+ #endif
456
+ char context_buffer[WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS*sizeof(struct wsp_ggml_tensor) + WSP_GGML_MAX_SPLITS*sizeof(struct wsp_ggml_cgraph)];
457
+ };
458
+
459
+ #define hash_id(node) wsp_ggml_hash_find_or_insert(sched->hash_set, node)
460
+ #define node_allocr(node) sched->node_talloc[hash_id(node)]
461
+
462
+ static bool wsp_ggml_is_view_op(enum wsp_ggml_op op) {
463
+ return op == WSP_GGML_OP_VIEW || op == WSP_GGML_OP_RESHAPE || op == WSP_GGML_OP_PERMUTE || op == WSP_GGML_OP_TRANSPOSE;
464
+ }
465
+
466
+ // returns the priority of the backend, lower is better
467
+ static int sched_backend_prio(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
468
+ for (int i = 0; i < sched->n_backends; i++) {
469
+ if (sched->backends[i] == backend) {
470
+ return i;
471
+ }
472
+ }
473
+ return INT_MAX;
474
+ }
475
+
476
+ static int sched_allocr_prio(wsp_ggml_backend_sched_t sched, wsp_ggml_tallocr_t allocr) {
477
+ for (int i = 0; i < sched->n_backends; i++) {
478
+ if (sched->tallocs[i] == allocr) {
479
+ return i;
480
+ }
481
+ }
482
+ return INT_MAX;
483
+ }
484
+
485
+ // returns the backend that should be used for the node based on the current locations
486
+ char causes[WSP_GGML_DEFAULT_GRAPH_SIZE*4 + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
487
+ static wsp_ggml_backend_t sched_backend_from_cur(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node) {
488
+ // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
489
+ // ie. kv cache updates
490
+ // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
491
+ // dst
492
+ wsp_ggml_backend_t cur_backend = wsp_ggml_get_backend(node);
493
+ if (cur_backend != NULL) {
494
+ sprintf(causes[hash_id(node)], "1.dst");
495
+ return cur_backend;
496
+ }
497
+
498
+ // view_src
499
+ if (node->view_src != NULL && wsp_ggml_get_backend(node->view_src) != NULL) {
500
+ sprintf(causes[hash_id(node)], "1.vsrc");
501
+ return wsp_ggml_get_backend(node->view_src);
502
+ }
503
+
504
+ // src
505
+ int cur_prio = INT_MAX;
506
+ size_t cur_size = 0;
507
+
508
+ for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
509
+ const struct wsp_ggml_tensor * src = node->src[i];
510
+ if (src == NULL) {
511
+ break;
512
+ }
513
+ wsp_ggml_backend_t src_backend = wsp_ggml_get_backend(src);
514
+ if (src_backend != NULL) {
515
+ int src_prio = sched_backend_prio(sched, src_backend);
516
+ size_t src_size = wsp_ggml_nbytes(src);
517
+ if (src_prio < cur_prio && src_size >= cur_size) {
518
+ cur_prio = src_prio;
519
+ cur_size = src_size;
520
+ cur_backend = src_backend;
521
+ sprintf(causes[hash_id(node)], "1.src%d", i);
522
+ }
523
+ }
524
+ }
525
+ return cur_backend;
526
+ }
527
+
528
// Formats a byte count as whole mebibytes ("<n>M") when it is at least 1 MiB,
// otherwise as whole kibibytes ("<n>K"); both are truncated, not rounded.
// Returns a pointer to a function-local static buffer, so each call
// overwrites the previous result.
static char * fmt_size(size_t size) {
    static char buffer[128];

    size_t value = size / 1024;
    char   unit  = 'K';
    if (size >= 1024*1024) {
        value /= 1024;
        unit   = 'M';
    }

    snprintf(buffer, sizeof(buffer), "%zu%c", value, unit);
    return buffer;
}
537
+
538
+ static void sched_print_assignments(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
539
+ int cur_split = 0;
540
+ for (int i = 0; i < graph->n_nodes; i++) {
541
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
542
+ wsp_ggml_backend_t split_backend = wsp_ggml_tallocr_get_buffer(sched->splits[cur_split].tallocr)->backend;
543
+ fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, wsp_ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs);
544
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
545
+ fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(wsp_ggml_nbytes(sched->splits[cur_split].inputs[j])));
546
+ }
547
+ fprintf(stderr, "\n");
548
+ cur_split++;
549
+ }
550
+ struct wsp_ggml_tensor * node = graph->nodes[i];
551
+ if (wsp_ggml_is_view_op(node->op)) {
552
+ continue;
553
+ }
554
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
555
+ wsp_ggml_backend_t node_backend = node_allocr ? wsp_ggml_tallocr_get_buffer(node_allocr)->backend : NULL;
556
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name, fmt_size(wsp_ggml_nbytes(node)), node_allocr ? wsp_ggml_backend_name(node_backend) : "NULL", causes[hash_id(node)]);
557
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
558
+ struct wsp_ggml_tensor * src = node->src[j];
559
+ if (src == NULL) {
560
+ break;
561
+ }
562
+ wsp_ggml_tallocr_t src_allocr = node_allocr(src);
563
+ wsp_ggml_backend_t src_backend = src_allocr ? wsp_ggml_tallocr_get_buffer(src_allocr)->backend : NULL;
564
+ fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, fmt_size(wsp_ggml_nbytes(src)), src_backend ? wsp_ggml_backend_name(src_backend) : "NULL", causes[hash_id(src)]);
565
+ }
566
+ fprintf(stderr, "\n");
567
+ }
568
+ }
569
+
570
+ // creates a copy of the tensor with the same memory layout
571
+ static struct wsp_ggml_tensor * wsp_ggml_dup_tensor_layout(struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * tensor) {
572
+ struct wsp_ggml_tensor * dup = wsp_ggml_dup_tensor(ctx, tensor);
573
+ for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
574
+ dup->nb[i] = tensor->nb[i];
575
+ }
576
+ return dup;
577
+ }
578
+
579
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
580
+ // TODO: merge passes
581
+ static void sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
582
+ // reset state
583
+ size_t hash_size = sched->hash_set.size;
584
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
585
+ memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
586
+ memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
587
+ sched->n_splits = 0;
588
+
589
+ struct wsp_ggml_init_params params = {
590
+ /*.mem_size = */ sizeof(sched->context_buffer),
591
+ /*.mem_buffer = */ sched->context_buffer,
592
+ /*.no_alloc = */ true
593
+ };
594
+
595
+ if (sched->ctx != NULL) {
596
+ wsp_ggml_free(sched->ctx);
597
+ }
598
+
599
+ sched->ctx = wsp_ggml_init(params);
600
+
601
+ // pass 1: assign backends to ops with allocated inputs
602
+ for (int i = 0; i < graph->n_leafs; i++) {
603
+ struct wsp_ggml_tensor * leaf = graph->leafs[i];
604
+ if (node_allocr(leaf) != NULL) {
605
+ // do not overwrite user assignments
606
+ continue;
607
+ }
608
+ wsp_ggml_backend_t leaf_backend = wsp_ggml_get_backend(leaf);
609
+ if (leaf_backend == NULL && leaf->view_src != NULL) {
610
+ leaf_backend = wsp_ggml_get_backend(leaf->view_src);
611
+ }
612
+ if (leaf_backend != NULL) {
613
+ node_allocr(leaf) = wsp_ggml_backend_sched_get_tallocr(sched, leaf_backend);
614
+ }
615
+ }
616
+
617
+ for (int i = 0; i < graph->n_nodes; i++) {
618
+ struct wsp_ggml_tensor * node = graph->nodes[i];
619
+ if (node_allocr(node) != NULL) {
620
+ // do not overwrite user assignments
621
+ continue;
622
+ }
623
+ wsp_ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
624
+ if (node_backend != NULL) {
625
+ node_allocr(node) = wsp_ggml_backend_sched_get_tallocr(sched, node_backend);
626
+ }
627
+ }
628
+ //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
629
+
630
+ // pass 2: assign backends to ops from current assignments
631
+ // TODO:
632
+ // - reuse sched_backend_from_cur
633
+ for (int i = 0; i < graph->n_nodes; i++) {
634
+ struct wsp_ggml_tensor * node = graph->nodes[i];
635
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
636
+ if (node_allocr == NULL) {
637
+ int cur_prio = INT_MAX;
638
+ size_t cur_size = 0;
639
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
640
+ struct wsp_ggml_tensor * src = node->src[j];
641
+ if (src == NULL) {
642
+ break;
643
+ }
644
+ wsp_ggml_tallocr_t src_allocr = node_allocr(src);
645
+ if (src_allocr != NULL) {
646
+ int src_prio = sched_allocr_prio(sched, src_allocr);
647
+ size_t src_size = wsp_ggml_nbytes(src);
648
+ if (src_prio < cur_prio && src_size >= cur_size) {
649
+ cur_prio = src_prio;
650
+ cur_size = src_size;
651
+ node_allocr = src_allocr;
652
+ sprintf(causes[hash_id(node)], "2.src%d", j);
653
+ }
654
+ }
655
+ }
656
+ if (node_allocr != NULL) {
657
+ node_allocr(node) = node_allocr;
658
+ }
659
+ }
660
+ }
661
+ //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
662
+
663
+ // pass 3: assign backends to remaining src from dst (should only be leafs)
664
+ for (int i = 0; i < graph->n_nodes; i++) {
665
+ struct wsp_ggml_tensor * node = graph->nodes[i];
666
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
667
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
668
+ struct wsp_ggml_tensor * src = node->src[j];
669
+ if (src == NULL) {
670
+ break;
671
+ }
672
+ wsp_ggml_tallocr_t src_allocr = node_allocr(src);
673
+ if (src_allocr == NULL) {
674
+ node_allocr(src) = node_allocr;
675
+ }
676
+ }
677
+ }
678
+ //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
679
+
680
+ // pass 4: split graph, find tensors that need to be copied
681
+ // TODO:
682
+ // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
683
+ // find first backend
684
+ int cur_split = 0;
685
+ for (int i = 0; i < graph->n_nodes; i++) {
686
+ struct wsp_ggml_tensor * node = graph->nodes[i];
687
+ if (node->view_src == NULL) {
688
+ sched->splits[0].tallocr = node_allocr(node);
689
+ break;
690
+ }
691
+ }
692
+ sched->splits[0].i_start = 0;
693
+ sched->splits[0].n_inputs = 0;
694
+ memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
695
+ wsp_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
696
+ size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
697
+ for (int i = 0; i < graph->n_nodes; i++) {
698
+ struct wsp_ggml_tensor * node = graph->nodes[i];
699
+
700
+ if (wsp_ggml_is_view_op(node->op)) {
701
+ continue;
702
+ }
703
+
704
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
705
+
706
+ if (node_allocr != cur_allocr) {
707
+ sched->splits[cur_split].i_end = i;
708
+ cur_split++;
709
+ WSP_GGML_ASSERT(cur_split < WSP_GGML_MAX_SPLITS);
710
+ sched->splits[cur_split].tallocr = node_allocr;
711
+ sched->splits[cur_split].i_start = i;
712
+ sched->splits[cur_split].n_inputs = 0;
713
+ memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
714
+ cur_allocr = node_allocr;
715
+ cur_backend_id = sched_allocr_prio(sched, cur_allocr);
716
+ }
717
+
718
+ // find inputs that are not on the same backend
719
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
720
+ struct wsp_ggml_tensor * src = node->src[j];
721
+ if (src == NULL) {
722
+ break;
723
+ }
724
+ wsp_ggml_tallocr_t src_allocr = node_allocr(src);
725
+ if (src_allocr != node_allocr) {
726
+ int n_inputs = sched->splits[cur_split].n_inputs++;
727
+ WSP_GGML_ASSERT(n_inputs < WSP_GGML_MAX_SPLIT_INPUTS);
728
+ sched->splits[cur_split].inputs[n_inputs] = (struct wsp_ggml_tensor *)src;
729
+
730
+ // create copies
731
+ size_t id = hash_id(src);
732
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
733
+ struct wsp_ggml_tensor * tensor_copy = wsp_ggml_dup_tensor_layout(sched->ctx, src);
734
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
735
+ node_allocr(tensor_copy) = cur_allocr;
736
+ wsp_ggml_backend_t backend = wsp_ggml_tallocr_get_buffer(cur_allocr)->backend;
737
+ wsp_ggml_format_name(tensor_copy, "%s#%s", wsp_ggml_backend_name(backend), src->name);
738
+ }
739
+ node->src[j] = sched->node_copies[id][cur_backend_id];
740
+ }
741
+ }
742
+ }
743
+ sched->splits[cur_split].i_end = graph->n_nodes;
744
+ sched->n_splits = cur_split + 1;
745
+
746
+ //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
747
+
748
+ #if 1
749
+ // sanity check: all sources should have the same backend as the node
750
+ for (int i = 0; i < graph->n_nodes; i++) {
751
+ struct wsp_ggml_tensor * node = graph->nodes[i];
752
+ wsp_ggml_tallocr_t node_allocr = node_allocr(node);
753
+ if (node_allocr == NULL) {
754
+ fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
755
+ }
756
+ for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
757
+ struct wsp_ggml_tensor * src = node->src[j];
758
+ if (src == NULL) {
759
+ break;
760
+ }
761
+ wsp_ggml_tallocr_t src_allocr = node_allocr(src);
762
+ if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
763
+ fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
764
+ node->name, node_allocr ? wsp_ggml_backend_name(wsp_ggml_tallocr_get_buffer(node_allocr)->backend) : "NULL",
765
+ j, src->name, src_allocr ? wsp_ggml_backend_name(wsp_ggml_tallocr_get_buffer(src_allocr)->backend) : "NULL");
766
+ }
767
+ }
768
+ }
769
+ #endif
770
+
771
+ // create copies of the graph for each split
772
+ // FIXME: avoid this copy, pass split inputs to wsp_ggml_gallocr_alloc_graph_n in some other way
773
+ struct wsp_ggml_cgraph * graph_copy = wsp_ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*WSP_GGML_MAX_SPLIT_INPUTS, false);
774
+ for (int i = 0; i < sched->n_splits; i++) {
775
+ struct wsp_ggml_backend_sched_split * split = &sched->splits[i];
776
+ split->graph = wsp_ggml_graph_view(sched->ctx, graph, split->i_start, split->i_end);
777
+
778
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
779
+ for (int j = 0; j < split->n_inputs; j++) {
780
+ struct wsp_ggml_tensor * input = split->inputs[j];
781
+ struct wsp_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
782
+ input_cpy->src[0] = input;
783
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
784
+ }
785
+
786
+ for (int j = split->i_start; j < split->i_end; j++) {
787
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
788
+ }
789
+ }
790
+ sched->graph = graph_copy;
791
+ }
792
+
793
+ static void sched_alloc_splits(wsp_ggml_backend_sched_t sched) {
794
+ wsp_ggml_gallocr_alloc_graph_n(
795
+ sched->galloc,
796
+ sched->graph,
797
+ sched->hash_set,
798
+ sched->node_talloc);
799
+ }
800
+
801
+ static void sched_compute_splits(wsp_ggml_backend_sched_t sched) {
802
+ uint64_t copy_us[WSP_GGML_MAX_BACKENDS] = {0};
803
+ uint64_t compute_us[WSP_GGML_MAX_BACKENDS] = {0};
804
+
805
+ struct wsp_ggml_backend_sched_split * splits = sched->splits;
806
+
807
+ for (int i = 0; i < sched->n_splits; i++) {
808
+ struct wsp_ggml_backend_sched_split * split = &splits[i];
809
+ wsp_ggml_backend_t split_backend = wsp_ggml_tallocr_get_buffer(split->tallocr)->backend;
810
+ int split_backend_id = sched_backend_prio(sched, split_backend);
811
+
812
+ // copy the input tensors to the split backend
813
+ uint64_t copy_start_us = wsp_ggml_time_us();
814
+ for (int j = 0; j < split->n_inputs; j++) {
815
+ struct wsp_ggml_tensor * input_cpy = sched->node_copies[hash_id(split->inputs[j])][sched_backend_prio(sched, split_backend)];
816
+ if (split->inputs[j]->buffer == NULL) {
817
+ if (split->inputs[j]->view_src == NULL) {
818
+ fprintf(stderr, "input %s has no buffer and no view_src\n", split->inputs[j]->name);
819
+ exit(1);
820
+ }
821
+ struct wsp_ggml_tensor * view = split->inputs[j];
822
+ view->backend = view->view_src->backend;
823
+ view->buffer = view->view_src->buffer;
824
+ view->data = (char *)view->view_src->data + view->view_offs;
825
+ wsp_ggml_backend_buffer_init_tensor(wsp_ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
826
+ }
827
+ if (input_cpy->buffer == NULL) {
828
+ fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
829
+ exit(1);
830
+ }
831
+ WSP_GGML_ASSERT(split->inputs[j]->buffer->backend != input_cpy->buffer->backend);
832
+ WSP_GGML_ASSERT(input_cpy->buffer->backend == split_backend);
833
+ wsp_ggml_backend_tensor_copy(split->inputs[j], input_cpy);
834
+ }
835
+ // wsp_ggml_backend_synchronize(split_backend);
836
+ int64_t copy_end_us = wsp_ggml_time_us();
837
+ copy_us[split_backend_id] += copy_end_us - copy_start_us;
838
+
839
+ #if 0
840
+ char split_filename[WSP_GGML_MAX_NAME];
841
+ snprintf(split_filename, WSP_GGML_MAX_NAME, "split_%i_%s.dot", i, wsp_ggml_backend_name(split_backend));
842
+ wsp_ggml_graph_dump_dot(split->graph, NULL, split_filename);
843
+ #endif
844
+
845
+ uint64_t compute_start_us = wsp_ggml_time_us();
846
+ wsp_ggml_backend_graph_compute(split_backend, split->graph);
847
+ // wsp_ggml_backend_synchronize(split_backend);
848
+ uint64_t compute_end_us = wsp_ggml_time_us();
849
+ compute_us[split_backend_id] += compute_end_us - compute_start_us;
850
+ }
851
+
852
+ #if 0
853
+ // per-backend timings
854
+ fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
855
+ for (int i = 0; i < sched->n_backends; i++) {
856
+ if (copy_us[i] > 0 || compute_us[i] > 0) {
857
+ fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", wsp_ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
858
+ }
859
+ }
860
+ #endif
861
+ }
862
+
863
+ static void sched_reset(wsp_ggml_backend_sched_t sched) {
864
+ for (int i = 0; i < sched->n_backends; i++) {
865
+ wsp_ggml_tallocr_reset(sched->tallocs[i]);
866
+ }
867
+ }
868
+
869
+ wsp_ggml_backend_sched_t wsp_ggml_backend_sched_new(wsp_ggml_backend_t * backends, int n_backends) {
870
+ WSP_GGML_ASSERT(n_backends <= WSP_GGML_MAX_BACKENDS);
871
+
872
+ struct wsp_ggml_backend_sched * sched = malloc(sizeof(struct wsp_ggml_backend_sched));
873
+ memset(sched, 0, sizeof(struct wsp_ggml_backend_sched));
874
+
875
+ fprintf(stderr, "wsp_ggml_backend_sched size: %lu KB\n", sizeof(struct wsp_ggml_backend_sched)/1024);
876
+
877
+ sched->n_backends = n_backends;
878
+ for (int i = 0; i < n_backends; i++) {
879
+ sched->backends[i] = backends[i];
880
+ }
881
+
882
+ sched->galloc = wsp_ggml_gallocr_new();
883
+
884
+ // init measure allocs for each backend
885
+ for (int i = 0; i < n_backends; i++) {
886
+ sched->tallocs[i] = wsp_ggml_tallocr_new_measure_from_backend(backends[i]);
887
+ }
888
+
889
+ return sched;
890
+ }
891
+
892
+ void wsp_ggml_backend_sched_free(wsp_ggml_backend_sched_t sched) {
893
+ if (sched == NULL) {
894
+ return;
895
+ }
896
+ for (int i = 0; i < sched->n_backends; i++) {
897
+ wsp_ggml_tallocr_free(sched->tallocs[i]);
898
+ }
899
+ wsp_ggml_gallocr_free(sched->galloc);
900
+ free(sched->hash_set.keys);
901
+ free(sched->node_talloc);
902
+ free(sched->node_copies);
903
+ free(sched);
904
+ }
905
+
906
+ void wsp_ggml_backend_sched_init_measure(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * measure_graph) {
907
+ // initialize hash tables
908
+ size_t hash_size = measure_graph->visited_hash_table.size + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS;
909
+ sched->hash_set.size = hash_size;
910
+ sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
911
+ sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
912
+ sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
913
+
914
+ sched_split_graph(sched, measure_graph);
915
+ sched_alloc_splits(sched);
916
+
917
+ // allocate buffers and reset allocators
918
+ for (int i = 0; i < sched->n_backends; i++) {
919
+ size_t size = wsp_ggml_tallocr_max_size(sched->tallocs[i]);
920
+ wsp_ggml_tallocr_free(sched->tallocs[i]);
921
+ sched->tallocs[i] = wsp_ggml_tallocr_new_from_backend(sched->backends[i], size);
922
+ }
923
+
924
+ sched_reset(sched);
925
+ }
926
+
927
+ void wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph) {
928
+ WSP_GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + WSP_GGML_MAX_SPLITS*WSP_GGML_MAX_SPLIT_INPUTS);
929
+
930
+ sched_split_graph(sched, graph);
931
+ sched_alloc_splits(sched);
932
+ sched_compute_splits(sched);
933
+ sched_reset(sched);
934
+ }
935
+
936
+ wsp_ggml_tallocr_t wsp_ggml_backend_sched_get_tallocr(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
937
+ int backend_index = sched_backend_prio(sched, backend);
938
+ return sched->tallocs[backend_index];
939
+ }
940
+
941
+ wsp_ggml_backend_buffer_t wsp_ggml_backend_sched_get_buffer(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend) {
942
+ int backend_index = sched_backend_prio(sched, backend);
943
+ return wsp_ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
944
+ }
945
+
946
+ void wsp_ggml_backend_sched_set_node_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend) {
947
+ int backend_index = sched_backend_prio(sched, backend);
948
+ WSP_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
949
+ node_allocr(node) = sched->tallocs[backend_index];
950
+ }