llama_cpp 0.12.0 → 0.12.2

This diff shows the changes between publicly released versions of the package, as they appear in the public registry to which they were published, and is provided for informational purposes only.
@@ -15,7 +15,11 @@
15
15
 
16
16
  // backend buffer type
17
17
 
18
- ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
18
+ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
19
+ return buft->iface.get_name(buft);
20
+ }
21
+
22
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
19
23
  return buft->iface.alloc_buffer(buft, size);
20
24
  }
21
25
 
@@ -23,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
23
27
  return buft->iface.get_alignment(buft);
24
28
  }
25
29
 
26
- size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
30
+ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
27
31
  // get_alloc_size is optional, defaults to ggml_nbytes
28
32
  if (buft->iface.get_alloc_size) {
29
33
  return buft->iface.get_alloc_size(buft, tensor);
@@ -44,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
44
48
 
45
49
  // backend buffer
46
50
 
47
- ggml_backend_buffer_t ggml_backend_buffer_init(
51
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
48
52
  ggml_backend_buffer_type_t buft,
49
53
  struct ggml_backend_buffer_i iface,
50
54
  ggml_backend_buffer_context_t context,
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
58
62
  /* .buft = */ buft,
59
63
  /* .context = */ context,
60
64
  /* .size = */ size,
65
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
61
66
  };
62
67
 
63
68
  return buffer;
64
69
  }
65
70
 
71
+ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
72
+ return buffer->iface.get_name(buffer);
73
+ }
74
+
66
75
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
67
76
  if (buffer == NULL) {
68
77
  return;
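
These first hunks annotate the backend buffer-type and buffer entry points with the GGML_CALL calling-convention macro and add name accessors: ggml_backend_buft_name() for buffer types and ggml_backend_buffer_name() for buffers, plus a usage field that is initialized to GGML_BACKEND_BUFFER_USAGE_ANY. A minimal sketch of the new accessors against the CPU buffer type (example code, not part of the diff; it assumes the ggml-backend.h shipped with this release):

    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
        // new: buffer types report a human-readable name
        printf("buffer type: %s (alignment: %zu, host: %d)\n",
               ggml_backend_buft_name(buft),
               ggml_backend_buft_get_alignment(buft),
               ggml_backend_buft_is_host(buft));

        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);
        // new: buffers report a name as well (forwarded to the iface.get_name hook)
        printf("allocated %zu bytes in buffer '%s'\n",
               ggml_backend_buffer_get_size(buf),
               ggml_backend_buffer_name(buf));
        ggml_backend_buffer_free(buf);
        return 0;
    }
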
@@ -86,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
86
95
  return base;
87
96
  }
88
97
 
89
- void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
98
+ GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
90
99
  // init_tensor is optional
91
100
  if (buffer->iface.init_tensor) {
92
101
  buffer->iface.init_tensor(buffer, tensor);
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
94
103
  }
95
104
 
96
105
  size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
97
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
106
+ return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
98
107
  }
99
108
 
100
109
  size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
101
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
110
+ return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
102
111
  }
103
112
 
104
113
  void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
106
115
  }
107
116
 
108
117
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
109
- return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
118
+ return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
110
119
  }
111
120
 
112
- ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
121
+ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
122
+ buffer->usage = usage;
123
+ }
124
+
125
+ ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
113
126
  return buffer->buft;
114
127
  }
115
128
 
129
+ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
130
+ if (buffer->iface.reset) {
131
+ buffer->iface.reset(buffer);
132
+ }
133
+ }
134
+
135
+ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
136
+ ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
137
+ if (dst_buf->iface.cpy_tensor) {
138
+ return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
139
+ }
140
+ return false;
141
+ }
142
+
116
143
  // backend
117
144
 
118
145
  const char * ggml_backend_name(ggml_backend_t backend) {
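
This hunk renames ggml_backend_buffer_type() to ggml_backend_buffer_get_type(), adds ggml_backend_buffer_set_usage() and ggml_backend_buffer_reset(), and replaces the old per-direction copy hooks with a single ggml_backend_buffer_copy_tensor() helper. A short sketch of the usage flag and the renamed accessor (example code, not part of the diff):

    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
        ggml_backend_buffer_t weights = ggml_backend_buft_alloc_buffer(buft, 4096);

        // new: tag the buffer as holding weights; the reworked scheduler (later hunks)
        // pins operations that read a weight tensor to the backend that owns this buffer
        ggml_backend_buffer_set_usage(weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        // renamed: ggml_backend_buffer_type() -> ggml_backend_buffer_get_type()
        ggml_backend_buffer_type_t t = ggml_backend_buffer_get_type(weights);
        printf("buffer '%s' was allocated from buffer type '%s'\n",
               ggml_backend_buffer_name(weights), ggml_backend_buft_name(t));

        ggml_backend_buffer_free(weights);
        return 0;
    }
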
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
146
173
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
147
174
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
148
175
 
149
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
176
+ if (backend->iface.set_tensor_async == NULL) {
177
+ ggml_backend_tensor_set(tensor, data, offset, size);
178
+ } else {
179
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
180
+ }
150
181
  }
151
182
 
152
183
  void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
153
184
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
154
185
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
155
186
 
156
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
187
+ if (backend->iface.get_tensor_async == NULL) {
188
+ ggml_backend_tensor_get(tensor, data, offset, size);
189
+ } else {
190
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
191
+ }
157
192
  }
158
193
 
159
- void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
194
+ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
195
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
196
+
160
197
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
161
- GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
198
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
162
199
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
163
200
 
164
- tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
201
+ tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
165
202
  }
166
203
 
167
- void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
204
+ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
205
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
206
+
168
207
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
169
208
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
170
209
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
171
210
 
172
- tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
211
+ tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
173
212
  }
174
213
 
175
214
  void ggml_backend_synchronize(ggml_backend_t backend) {
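
ggml_backend_tensor_set()/get() now resolve the underlying buffer through view_src for view tensors, and the _async variants fall back to the synchronous path when a backend does not implement them (as the CPU backend does not). A minimal host round trip (example code, not part of the diff; it assumes the ggml_backend_alloc_ctx_tensors() helper that ships alongside this release):

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_t backend = ggml_backend_cpu_init();

        struct ggml_init_params params = {
            /* .mem_size   = */ ggml_tensor_overhead() * 8,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,   // data lives in a backend buffer, not in the context
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

        // allocate all context tensors in a buffer owned by the backend
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

        float in[4] = {1, 2, 3, 4}, out[4] = {0};
        ggml_backend_tensor_set(t, in, 0, sizeof(in));   // host -> backend buffer
        ggml_backend_tensor_get(t, out, 0, sizeof(out)); // backend buffer -> host
        printf("out[3] = %g\n", out[3]);

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        ggml_backend_free(backend);
        return 0;
    }
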
@@ -190,16 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
190
229
 
191
230
  void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
192
231
  backend->iface.graph_plan_compute(backend, plan);
193
-
194
- // TODO: optional sync
195
- ggml_backend_synchronize(backend);
196
232
  }
197
233
 
198
- void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
199
- backend->iface.graph_compute(backend, cgraph);
200
-
201
- // TODO: optional sync
202
- ggml_backend_synchronize(backend);
234
+ bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
235
+ return backend->iface.graph_compute(backend, cgraph);
203
236
  }
204
237
 
205
238
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
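
ggml_backend_graph_compute() and the plan variant no longer synchronize implicitly, and graph_compute now returns a bool so callers can detect failure. A small end-to-end sketch on the CPU backend (example code, not part of the diff; same ggml_backend_alloc_ctx_tensors() assumption as above):

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_t backend = ggml_backend_cpu_init();

        struct ggml_init_params params = {
            /* .mem_size   = */ ggml_tensor_overhead() * 16 + ggml_graph_overhead(),
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_add(ctx, a, a);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, b);

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

        float in[4] = {1, 2, 3, 4}, out[4];
        ggml_backend_tensor_set(a, in, 0, sizeof(in));

        // new: the return value signals whether the backend ran the graph
        if (!ggml_backend_graph_compute(backend, gf)) {
            fprintf(stderr, "graph compute failed\n");
            return 1;
        }
        ggml_backend_synchronize(backend); // no longer done implicitly
        ggml_backend_tensor_get(b, out, 0, sizeof(out));
        printf("2*a = [%g %g %g %g]\n", out[0], out[1], out[2], out[3]);

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        ggml_backend_free(backend);
        return 0;
    }
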
@@ -224,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
224
257
  }
225
258
 
226
259
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
227
- //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
228
- //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
229
260
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
230
261
 
231
- // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
232
-
233
262
  if (src == dst) {
234
263
  return;
235
264
  }
236
265
 
237
- // TODO: allow backends to support copy to/from same backend
238
-
239
- if (dst->buffer->iface.cpy_tensor_from != NULL) {
240
- dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
241
- } else if (src->buffer->iface.cpy_tensor_to != NULL) {
242
- src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
243
- } else {
244
- // shouldn't be hit when copying from/to CPU
245
- #ifndef NDEBUG
246
- fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
247
- "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
248
- #endif
266
+ if (ggml_backend_buffer_is_host(src->buffer)) {
267
+ ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
268
+ } else if (ggml_backend_buffer_is_host(dst->buffer)) {
269
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
270
+ } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
271
+ #ifndef NDEBUG
272
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
273
+ #endif
249
274
  size_t nbytes = ggml_nbytes(src);
250
275
  void * data = malloc(nbytes);
251
276
  ggml_backend_tensor_get(src, data, 0, nbytes);
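
ggml_backend_tensor_copy() drops the old cpy_tensor_from/cpy_tensor_to pair: it now takes a direct path when either side is in host memory, then tries the destination buffer's single cpy_tensor hook via ggml_backend_buffer_copy_tensor(), and only then bounces through a temporary host allocation. With a single CPU backend only the host path is exercised, but the call is the same; a short sketch (example code, not part of the diff; same ggml_backend_alloc_ctx_tensors() assumption as above):

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    int main(void) {
        ggml_backend_t backend = ggml_backend_cpu_init();

        struct ggml_init_params params = {
            /* .mem_size   = */ ggml_tensor_overhead() * 8,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor * src = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

        float in[4] = {5, 6, 7, 8}, out[4] = {0};
        ggml_backend_tensor_set(src, in, 0, sizeof(in));

        // same-layout copy; host path here, cpy_tensor hook or bounce copy across devices
        ggml_backend_tensor_copy(src, dst);

        ggml_backend_tensor_get(dst, out, 0, sizeof(out));
        printf("dst[0] = %g\n", out[0]);

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        ggml_backend_free(backend);
        return 0;
    }
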
@@ -254,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
254
279
  }
255
280
  }
256
281
 
282
+ void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
283
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
284
+
285
+ if (src == dst) {
286
+ return;
287
+ }
288
+
289
+ if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
290
+ if (backend->iface.cpy_tensor_async != NULL) {
291
+ if (backend->iface.cpy_tensor_async(backend, src, dst)) {
292
+ return;
293
+ }
294
+ }
295
+ }
296
+
297
+ size_t nbytes = ggml_nbytes(src);
298
+ if (ggml_backend_buffer_is_host(src->buffer)) {
299
+ ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
300
+ }
301
+ else {
302
+ ggml_backend_tensor_copy(src, dst);
303
+ }
304
+ }
305
+
306
+
257
307
  // backend registry
258
308
 
259
309
  #define GGML_MAX_BACKENDS_REG 16
@@ -268,9 +318,9 @@ struct ggml_backend_reg {
268
318
  static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
269
319
  static size_t ggml_backend_registry_count = 0;
270
320
 
271
- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
321
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
272
322
 
273
- static void ggml_backend_registry_init(void) {
323
+ GGML_CALL static void ggml_backend_registry_init(void) {
274
324
  static bool initialized = false;
275
325
 
276
326
  if (initialized) {
@@ -283,18 +333,18 @@ static void ggml_backend_registry_init(void) {
283
333
 
284
334
  // add forward decls here to avoid including the backend headers
285
335
  #ifdef GGML_USE_CUBLAS
286
- extern void ggml_backend_cuda_reg_devices(void);
336
+ extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
287
337
  ggml_backend_cuda_reg_devices();
288
338
  #endif
289
339
 
290
340
  #ifdef GGML_USE_METAL
291
- extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
292
- extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
341
+ extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
342
+ extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
293
343
  ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
294
344
  #endif
295
345
  }
296
346
 
297
- void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
347
+ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
298
348
  GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
299
349
 
300
350
  size_t id = ggml_backend_registry_count;
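
The registry path gets the same GGML_CALL treatment, including the forward declarations for the CUDA and Metal registration hooks. For reference, the public registry API built on this code can enumerate whichever backends were compiled in (example code, not part of the diff; the ggml_backend_reg_* declarations are from ggml-backend.h of this release):

    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        size_t n = ggml_backend_reg_get_count();
        for (size_t i = 0; i < n; i++) {
            printf("backend #%zu: %s\n", i, ggml_backend_reg_get_name(i));
        }
        // index 0 is the CPU backend, registered first by ggml_backend_registry_init()
        ggml_backend_t backend = ggml_backend_reg_init_backend(0, NULL);
        printf("initialized: %s\n", ggml_backend_name(backend));
        ggml_backend_free(backend);
        return 0;
    }
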
@@ -389,68 +439,80 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
389
439
 
390
440
  // backend CPU
391
441
 
392
- static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
442
+ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
443
+ return "CPU";
444
+
445
+ GGML_UNUSED(buffer);
446
+ }
447
+
448
+ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
393
449
  return (void *)buffer->context;
394
450
  }
395
451
 
396
- static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
452
+ GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
397
453
  free(buffer->context);
398
454
  }
399
455
 
400
- static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
456
+ GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
401
457
  memcpy((char *)tensor->data + offset, data, size);
402
458
 
403
459
  GGML_UNUSED(buffer);
404
460
  }
405
461
 
406
- static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
462
+ GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
407
463
  memcpy(data, (const char *)tensor->data + offset, size);
408
464
 
409
465
  GGML_UNUSED(buffer);
410
466
  }
411
467
 
412
- static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
413
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
414
-
415
- GGML_UNUSED(buffer);
416
- }
417
-
418
- static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
419
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
468
+ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
469
+ if (ggml_backend_buffer_is_host(src->buffer)) {
470
+ memcpy(dst->data, src->data, ggml_nbytes(src));
471
+ return true;
472
+ }
473
+ return false;
420
474
 
421
475
  GGML_UNUSED(buffer);
422
476
  }
423
477
 
424
- static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
478
+ GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
425
479
  memset(buffer->context, value, buffer->size);
426
480
  }
427
481
 
428
482
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
483
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
429
484
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
430
485
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
431
486
  /* .init_tensor = */ NULL, // no initialization required
432
487
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
433
488
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
434
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
435
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
489
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
436
490
  /* .clear = */ ggml_backend_cpu_buffer_clear,
491
+ /* .reset = */ NULL,
437
492
  };
438
493
 
439
494
  // for buffers from ptr, free is not called
440
495
  static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
496
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
441
497
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
442
498
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
443
499
  /* .init_tensor = */ NULL, // no initialization required
444
500
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
445
501
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
446
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
447
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
502
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
448
503
  /* .clear = */ ggml_backend_cpu_buffer_clear,
504
+ /* .reset = */ NULL,
449
505
  };
450
506
 
451
507
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
452
508
 
453
- static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
509
+ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
510
+ return "CPU";
511
+
512
+ GGML_UNUSED(buft);
513
+ }
514
+
515
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
454
516
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
455
517
  void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
456
518
 
@@ -459,27 +521,28 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
459
521
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
460
522
  }
461
523
 
462
- static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
524
+ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
463
525
  return TENSOR_ALIGNMENT;
464
526
 
465
527
  GGML_UNUSED(buft);
466
528
  }
467
529
 
468
- static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
530
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
469
531
  return ggml_backend_is_cpu(backend);
470
532
 
471
533
  GGML_UNUSED(buft);
472
534
  }
473
535
 
474
- static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
536
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
475
537
  return true;
476
538
 
477
539
  GGML_UNUSED(buft);
478
540
  }
479
541
 
480
- ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
542
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
481
543
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
482
544
  /* .iface = */ {
545
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
483
546
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
484
547
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
485
548
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -498,11 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
498
561
 
499
562
  #include <hbwmalloc.h>
500
563
 
501
- static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
564
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
565
+ return "CPU_HBM";
566
+
567
+ GGML_UNUSED(buft);
568
+ }
569
+
570
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
571
+ return "CPU_HBM";
572
+
573
+ GGML_UNUSED(buf);
574
+ }
575
+
576
+ GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
502
577
  hbw_free(buffer->context);
503
578
  }
504
579
 
505
- static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
580
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
506
581
  //void * ptr = hbw_malloc(size);
507
582
  void * ptr;
508
583
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -511,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
511
586
  return NULL;
512
587
  }
513
588
 
514
- // FIXME: this is a hack to avoid having to implement a new buffer type
515
589
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
516
590
  buffer->buft = buft;
591
+ buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
517
592
  buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
518
593
 
519
594
  return buffer;
520
595
  }
521
596
 
522
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
597
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
523
598
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
524
599
  /* .iface = */ {
600
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
525
601
  /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
526
602
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
527
603
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -541,20 +617,20 @@ struct ggml_backend_cpu_context {
541
617
  size_t work_size;
542
618
  };
543
619
 
544
- static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
620
+ GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
545
621
  return "CPU";
546
622
 
547
623
  GGML_UNUSED(backend);
548
624
  }
549
625
 
550
- static void ggml_backend_cpu_free(ggml_backend_t backend) {
626
+ GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
551
627
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
552
628
  free(cpu_ctx->work_data);
553
629
  free(cpu_ctx);
554
630
  free(backend);
555
631
  }
556
632
 
557
- static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
633
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
558
634
  return ggml_backend_cpu_buffer_type();
559
635
 
560
636
  GGML_UNUSED(backend);
@@ -565,7 +641,7 @@ struct ggml_backend_plan_cpu {
565
641
  struct ggml_cgraph cgraph;
566
642
  };
567
643
 
568
- static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
644
+ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
569
645
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
570
646
 
571
647
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -580,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
580
656
  return cpu_plan;
581
657
  }
582
658
 
583
- static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
659
+ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
584
660
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
585
661
 
586
662
  free(cpu_plan->cplan.work_data);
@@ -589,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
589
665
  GGML_UNUSED(backend);
590
666
  }
591
667
 
592
- static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
668
+ GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
593
669
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
594
670
 
595
671
  ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -597,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
597
673
  GGML_UNUSED(backend);
598
674
  }
599
675
 
600
- static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
676
+ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
601
677
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
602
678
 
603
679
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,9 +687,10 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
611
687
  cplan.work_data = cpu_ctx->work_data;
612
688
 
613
689
  ggml_graph_compute(cgraph, &cplan);
690
+ return true;
614
691
  }
615
692
 
616
- static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
693
+ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
617
694
  switch (op->op) {
618
695
  case GGML_OP_MUL_MAT:
619
696
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -630,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
630
707
  /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
631
708
  /* .set_tensor_async = */ NULL,
632
709
  /* .get_tensor_async = */ NULL,
633
- /* .cpy_tensor_from_async = */ NULL,
634
- /* .cpy_tensor_to_async = */ NULL,
710
+ /* .cpy_tensor_async = */ NULL,
635
711
  /* .synchronize = */ NULL,
636
712
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
637
713
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
@@ -656,8 +732,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
656
732
  return cpu_backend;
657
733
  }
658
734
 
659
- bool ggml_backend_is_cpu(ggml_backend_t backend) {
660
- return backend->iface.get_name == ggml_backend_cpu_name;
735
+ GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
736
+ return backend && backend->iface.get_name == ggml_backend_cpu_name;
661
737
  }
662
738
 
663
739
  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
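
ggml_backend_is_cpu() now tolerates a NULL backend and simply returns false for it. A tiny sketch (example code, not part of the diff):

    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        printf("is_cpu(NULL) = %d\n", ggml_backend_is_cpu(NULL)); // now safe, returns false
        ggml_backend_t backend = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(backend, 4);
        printf("is_cpu(%s) = %d\n", ggml_backend_name(backend), ggml_backend_is_cpu(backend));
        ggml_backend_free(backend);
        return 0;
    }
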
@@ -667,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
667
743
  ctx->n_threads = n_threads;
668
744
  }
669
745
 
670
- ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
746
+ GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
671
747
  return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
672
748
  }
673
749
 
674
- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
750
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
675
751
  return ggml_backend_cpu_init();
676
752
 
677
753
  GGML_UNUSED(params);
@@ -681,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
681
757
 
682
758
  // scheduler
683
759
 
684
- #define GGML_MAX_BACKENDS 4
760
+ #define GGML_MAX_BACKENDS 16
685
761
  #define GGML_MAX_SPLITS 256
686
762
  #define GGML_MAX_SPLIT_INPUTS 16
687
763
 
@@ -691,21 +767,29 @@ struct ggml_backend_sched_split {
691
767
  int i_end;
692
768
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
693
769
  int n_inputs;
770
+ // graph view of this split
694
771
  struct ggml_cgraph graph;
695
772
  };
696
773
 
697
774
  struct ggml_backend_sched {
775
+ bool is_reset; // true if the scheduler has been reset since the last graph split
776
+
698
777
  int n_backends;
699
778
  ggml_backend_t backends[GGML_MAX_BACKENDS];
779
+ ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
700
780
  ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
701
781
 
702
782
  ggml_gallocr_t galloc;
703
783
 
784
+ // hash keys of the nodes in the graph
704
785
  struct ggml_hash_set hash_set;
705
- ggml_tallocr_t * node_talloc; // [hash_set.size]
706
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
786
+ // hash values (arrays of [hash_set.size])
787
+ ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
788
+ struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
707
789
 
790
+ // copy of the graph with modified inputs
708
791
  struct ggml_cgraph * graph;
792
+
709
793
  struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
710
794
  int n_splits;
711
795
 
@@ -746,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
746
830
  return INT_MAX;
747
831
  }
748
832
 
749
- static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
833
+ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
750
834
  if (buffer == NULL) {
751
835
  return NULL;
752
836
  }
837
+
838
+ // check if this is already allocate in a allocr buffer (from user manual allocations)
839
+ for (int i = 0; i < sched->n_backends; i++) {
840
+ if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
841
+ return sched->tallocs[i];
842
+ }
843
+ }
844
+
753
845
  // find highest prio backend that supports the buffer type
754
846
  for (int i = 0; i < sched->n_backends; i++) {
755
847
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
756
- return sched->backends[i];
848
+ return sched->tallocs[i];
757
849
  }
758
850
  }
759
851
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
@@ -763,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
763
855
  if (allocr == NULL) {
764
856
  return NULL;
765
857
  }
766
- // find highest prio backend that supports the buffer type
767
858
  for (int i = 0; i < sched->n_backends; i++) {
768
859
  if (sched->tallocs[i] == allocr) {
769
860
  return sched->backends[i];
@@ -773,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
773
864
  }
774
865
 
775
866
  #if 0
776
- static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
867
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
777
868
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
778
869
  #define GET_CAUSE(node) causes[hash_id(node)]
779
870
  #else
@@ -782,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
782
873
  #endif
783
874
 
784
875
  // returns the backend that should be used for the node based on the current locations
785
- static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
786
- // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
787
- // ie. kv cache updates
788
- // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
876
+ static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
877
+ // assign pre-allocated nodes to their backend
789
878
  // dst
790
- ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
791
- if (cur_backend != NULL) {
879
+ ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
880
+ if (cur_allocr != NULL) {
792
881
  SET_CAUSE(node, "1.dst");
793
- return cur_backend;
882
+ return cur_allocr;
794
883
  }
795
-
796
884
  // view_src
797
- if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
798
- SET_CAUSE(node, "1.vsrc");
799
- return get_buffer_backend(sched, node->view_src->buffer);
885
+ if (node->view_src != NULL) {
886
+ cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
887
+ if (cur_allocr != NULL) {
888
+ SET_CAUSE(node, "1.vsrc");
889
+ return cur_allocr;
890
+ }
800
891
  }
801
-
802
- // src
803
- int cur_prio = INT_MAX;
804
- size_t cur_size = 0;
805
-
892
+ // assign nodes that use weights to the backend of the weights
806
893
  for (int i = 0; i < GGML_MAX_SRC; i++) {
807
894
  const struct ggml_tensor * src = node->src[i];
808
895
  if (src == NULL) {
809
896
  break;
810
897
  }
811
- ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
812
- if (src_backend != NULL) {
813
- int src_prio = sched_backend_prio(sched, src_backend);
814
- size_t src_size = ggml_nbytes(src);
815
- if (src_prio < cur_prio && src_size >= cur_size) {
816
- cur_prio = src_prio;
817
- cur_size = src_size;
818
- cur_backend = src_backend;
819
- SET_CAUSE(node, "1.src%d", i);
820
- }
898
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
899
+ ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
900
+ // operations with weights are always run on the same backend as the weights
901
+ SET_CAUSE(node, "1.wgt%d", i);
902
+ return src_allocr;
821
903
  }
822
904
  }
823
- return cur_backend;
905
+
906
+ return NULL;
824
907
  }
825
908
 
826
909
  static char * fmt_size(size_t size) {
@@ -853,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
853
936
  }
854
937
  ggml_tallocr_t node_allocr = node_allocr(node);
855
938
  ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
856
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
939
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
857
940
  fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
858
941
  for (int j = 0; j < GGML_MAX_SRC; j++) {
859
942
  struct ggml_tensor * src = node->src[j];
@@ -862,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
862
945
  }
863
946
  ggml_tallocr_t src_allocr = node_allocr(src);
864
947
  ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
865
- fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
948
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
866
949
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
867
950
  }
868
951
  fprintf(stderr, "\n");
@@ -878,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
878
961
  return dup;
879
962
  }
880
963
 
964
+
965
+ //#define DEBUG_PASS1
966
+ //#define DEBUG_PASS2
967
+ //#define DEBUG_PASS3
968
+ //#define DEBUG_PASS4
969
+
881
970
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
882
- // TODO: merge passes
883
971
  static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
884
- // reset state
885
- size_t hash_size = sched->hash_set.size;
886
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
887
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
888
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
972
+ // reset splits
889
973
  sched->n_splits = 0;
974
+ sched->is_reset = false;
890
975
 
891
976
  struct ggml_init_params params = {
892
977
  /* .mem_size = */ sizeof(sched->context_buffer),
@@ -894,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
894
979
  /* .no_alloc = */ true
895
980
  };
896
981
 
897
- if (sched->ctx != NULL) {
898
- ggml_free(sched->ctx);
899
- }
982
+ ggml_free(sched->ctx);
900
983
 
901
984
  sched->ctx = ggml_init(params);
985
+ if (sched->ctx == NULL) {
986
+ fprintf(stderr, "%s: failed to initialize context\n", __func__);
987
+ GGML_ASSERT(false);
988
+ }
902
989
 
903
- // pass 1: assign backends to ops with allocated inputs
990
+ // pass 1: assign backends to ops with pre-allocated inputs
904
991
  for (int i = 0; i < graph->n_leafs; i++) {
905
992
  struct ggml_tensor * leaf = graph->leafs[i];
906
993
  if (node_allocr(leaf) != NULL) {
907
994
  // do not overwrite user assignments
908
995
  continue;
909
996
  }
910
- ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
911
- if (leaf_backend == NULL && leaf->view_src != NULL) {
912
- leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
913
- }
914
- if (leaf_backend != NULL) {
915
- node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
916
- }
997
+ node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
917
998
  }
918
999
 
919
1000
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -922,50 +1003,120 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
922
1003
  // do not overwrite user assignments
923
1004
  continue;
924
1005
  }
925
- ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
926
- if (node_backend != NULL) {
927
- node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
1006
+ node_allocr(node) = sched_allocr_from_cur(sched, node);
1007
+ // src
1008
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1009
+ struct ggml_tensor * src = node->src[j];
1010
+ if (src == NULL) {
1011
+ break;
1012
+ }
1013
+ if (node_allocr(src) == NULL) {
1014
+ node_allocr(src) = sched_allocr_from_cur(sched, src);
1015
+ }
928
1016
  }
929
1017
  }
930
- //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1018
+ #ifdef DEBUG_PASS1
1019
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1020
+ #endif
931
1021
 
932
- // pass 2: assign backends to ops from current assignments
933
- // TODO:
934
- // - reuse sched_backend_from_cur
935
- for (int i = 0; i < graph->n_nodes; i++) {
936
- struct ggml_tensor * node = graph->nodes[i];
937
- ggml_tallocr_t node_allocr = node_allocr(node);
938
- if (node_allocr == NULL) {
939
- int cur_prio = INT_MAX;
940
- size_t cur_size = 0;
941
- for (int j = 0; j < GGML_MAX_SRC; j++) {
942
- struct ggml_tensor * src = node->src[j];
943
- if (src == NULL) {
944
- break;
1022
+ // pass 2: expand current backend assignments
1023
+ // assign the same backend to adjacent nodes
1024
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1025
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1026
+
1027
+ // pass 2.1 expand gpu up
1028
+ {
1029
+ ggml_tallocr_t cur_allocr = NULL;
1030
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1031
+ struct ggml_tensor * node = graph->nodes[i];
1032
+ if (ggml_is_view_op(node->op)) {
1033
+ continue;
1034
+ }
1035
+ ggml_tallocr_t node_allocr = node_allocr(node);
1036
+ if (node_allocr != NULL) {
1037
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1038
+ // skip cpu (lowest prio backend)
1039
+ cur_allocr = NULL;
1040
+ } else {
1041
+ cur_allocr = node_allocr;
945
1042
  }
946
- ggml_tallocr_t src_allocr = node_allocr(src);
947
- if (src_allocr != NULL) {
948
- int src_prio = sched_allocr_prio(sched, src_allocr);
949
- size_t src_size = ggml_nbytes(src);
950
- if (src_prio < cur_prio && src_size >= cur_size) {
951
- cur_prio = src_prio;
952
- cur_size = src_size;
953
- node_allocr = src_allocr;
954
- SET_CAUSE(node, "2.src%d", j);
955
- }
1043
+ } else {
1044
+ node_allocr(node) = cur_allocr;
1045
+ SET_CAUSE(node, "2.1");
1046
+ }
1047
+ }
1048
+ }
1049
+
1050
+ // pass 2.2 expand gpu down
1051
+ {
1052
+ ggml_tallocr_t cur_allocr = NULL;
1053
+ for (int i = 0; i < graph->n_nodes; i++) {
1054
+ struct ggml_tensor * node = graph->nodes[i];
1055
+ if (ggml_is_view_op(node->op)) {
1056
+ continue;
1057
+ }
1058
+ ggml_tallocr_t node_allocr = node_allocr(node);
1059
+ if (node_allocr != NULL) {
1060
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1061
+ // skip cpu (lowest prio backend)
1062
+ cur_allocr = NULL;
1063
+ } else {
1064
+ cur_allocr = node_allocr;
956
1065
  }
1066
+ } else {
1067
+ node_allocr(node) = cur_allocr;
1068
+ SET_CAUSE(node, "2.2");
957
1069
  }
1070
+ }
1071
+ }
1072
+
1073
+ // pass 2.3 expand rest up
1074
+ {
1075
+ ggml_tallocr_t cur_allocr = NULL;
1076
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1077
+ struct ggml_tensor * node = graph->nodes[i];
1078
+ if (ggml_is_view_op(node->op)) {
1079
+ continue;
1080
+ }
1081
+ ggml_tallocr_t node_allocr = node_allocr(node);
958
1082
  if (node_allocr != NULL) {
959
- node_allocr(node) = node_allocr;
1083
+ cur_allocr = node_allocr;
1084
+ } else {
1085
+ node_allocr(node) = cur_allocr;
1086
+ SET_CAUSE(node, "2.3");
960
1087
  }
961
1088
  }
962
1089
  }
963
- //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
964
1090
 
965
- // pass 3: assign backends to remaining src from dst (should only be leafs)
1091
+ // pass 2.4 expand rest down
1092
+ {
1093
+ ggml_tallocr_t cur_allocr = NULL;
1094
+ for (int i = 0; i < graph->n_nodes; i++) {
1095
+ struct ggml_tensor * node = graph->nodes[i];
1096
+ if (ggml_is_view_op(node->op)) {
1097
+ continue;
1098
+ }
1099
+ ggml_tallocr_t node_allocr = node_allocr(node);
1100
+ if (node_allocr != NULL) {
1101
+ cur_allocr = node_allocr;
1102
+ } else {
1103
+ node_allocr(node) = cur_allocr;
1104
+ SET_CAUSE(node, "2.4");
1105
+ }
1106
+ }
1107
+ }
1108
+ #ifdef DEBUG_PASS2
1109
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1110
+ #endif
1111
+
1112
+ // pass 3: assign backends to remaining src from dst and view_src
966
1113
  for (int i = 0; i < graph->n_nodes; i++) {
967
1114
  struct ggml_tensor * node = graph->nodes[i];
968
- ggml_tallocr_t node_allocr = node_allocr(node);
1115
+ ggml_tallocr_t cur_allocr = node_allocr(node);
1116
+ if (node->view_src != NULL && cur_allocr == NULL) {
1117
+ cur_allocr = node_allocr(node) = node_allocr(node->view_src);
1118
+ SET_CAUSE(node, "3.vsrc");
1119
+ }
969
1120
  for (int j = 0; j < GGML_MAX_SRC; j++) {
970
1121
  struct ggml_tensor * src = node->src[j];
971
1122
  if (src == NULL) {
@@ -973,81 +1124,107 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
973
1124
  }
974
1125
  ggml_tallocr_t src_allocr = node_allocr(src);
975
1126
  if (src_allocr == NULL) {
976
- node_allocr(src) = node_allocr;
1127
+ if (src->view_src != NULL) {
1128
+ // views are always on the same backend as the source
1129
+ node_allocr(src) = node_allocr(src->view_src);
1130
+ SET_CAUSE(src, "3.vsrc");
1131
+ } else {
1132
+ node_allocr(src) = cur_allocr;
1133
+ SET_CAUSE(src, "3.cur");
1134
+ }
977
1135
  }
978
1136
  }
979
1137
  }
980
- //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1138
+ #ifdef DEBUG_PASS3
1139
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1140
+ #endif
981
1141
 
982
1142
  // pass 4: split graph, find tensors that need to be copied
983
- // TODO:
984
- // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
985
- // find first backend
986
- int cur_split = 0;
987
- for (int i = 0; i < graph->n_nodes; i++) {
988
- struct ggml_tensor * node = graph->nodes[i];
989
- if (node->view_src == NULL) {
990
- sched->splits[0].tallocr = node_allocr(node);
991
- break;
992
- }
993
- }
994
- sched->splits[0].i_start = 0;
995
- sched->splits[0].n_inputs = 0;
996
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
997
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
998
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
999
- for (int i = 0; i < graph->n_nodes; i++) {
1000
- struct ggml_tensor * node = graph->nodes[i];
1001
-
1002
- if (ggml_is_view_op(node->op)) {
1003
- continue;
1143
+ {
1144
+ int cur_split = 0;
1145
+ // find the backend of the first split, skipping view ops
1146
+ for (int i = 0; i < graph->n_nodes; i++) {
1147
+ struct ggml_tensor * node = graph->nodes[i];
1148
+ if (!ggml_is_view_op(node->op)) {
1149
+ sched->splits[0].tallocr = node_allocr(node);
1150
+ break;
1151
+ }
1004
1152
  }
1153
+ sched->splits[0].i_start = 0;
1154
+ sched->splits[0].n_inputs = 0;
1155
+ memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1156
+ ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1157
+ size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1158
+ for (int i = 0; i < graph->n_nodes; i++) {
1159
+ struct ggml_tensor * node = graph->nodes[i];
1160
+
1161
+ if (ggml_is_view_op(node->op)) {
1162
+ continue;
1163
+ }
1005
1164
 
1006
- ggml_tallocr_t node_allocr = node_allocr(node);
1165
+ ggml_tallocr_t node_allocr = node_allocr(node);
1007
1166
 
1008
- if (node_allocr != cur_allocr) {
1009
- sched->splits[cur_split].i_end = i;
1010
- cur_split++;
1011
- GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1012
- sched->splits[cur_split].tallocr = node_allocr;
1013
- sched->splits[cur_split].i_start = i;
1014
- sched->splits[cur_split].n_inputs = 0;
1015
- memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
1016
- cur_allocr = node_allocr;
1017
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1018
- }
1167
+ GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
1019
1168
 
1020
- // find inputs that are not on the same backend
1021
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1022
- struct ggml_tensor * src = node->src[j];
1023
- if (src == NULL) {
1024
- break;
1169
+ if (node_allocr != cur_allocr) {
1170
+ sched->splits[cur_split].i_end = i;
1171
+ cur_split++;
1172
+ GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1173
+ sched->splits[cur_split].tallocr = node_allocr;
1174
+ sched->splits[cur_split].i_start = i;
1175
+ sched->splits[cur_split].n_inputs = 0;
1176
+ cur_allocr = node_allocr;
1177
+ cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1025
1178
  }
1026
- ggml_tallocr_t src_allocr = node_allocr(src);
1027
- if (src_allocr != node_allocr) {
1028
- int n_inputs = sched->splits[cur_split].n_inputs++;
1029
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1030
- sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
1031
-
1032
- // create copies
1033
- size_t id = hash_id(src);
1034
- if (sched->node_copies[id][cur_backend_id] == NULL) {
1035
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1036
- sched->node_copies[id][cur_backend_id] = tensor_copy;
1037
- node_allocr(tensor_copy) = cur_allocr;
1038
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1039
- ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1179
+
1180
+ // find inputs that are not on the same backend
1181
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1182
+ struct ggml_tensor * src = node->src[j];
1183
+ if (src == NULL) {
1184
+ break;
1185
+ }
1186
+ ggml_tallocr_t src_allocr = node_allocr(src);
1187
+ GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
1188
+ if (src_allocr != node_allocr) {
1189
+ // check if the input is already in the split
1190
+ bool found = false;
1191
+ for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
1192
+ if (sched->splits[cur_split].inputs[k] == src) {
1193
+ found = true;
1194
+ break;
1195
+ }
1196
+ }
1197
+
1198
+ if (!found) {
1199
+ int n_inputs = sched->splits[cur_split].n_inputs++;
1200
+ //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
1201
+ GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1202
+ sched->splits[cur_split].inputs[n_inputs] = src;
1203
+ }
1204
+
1205
+ // create a copy of the input in the split's backend
1206
+ size_t id = hash_id(src);
1207
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
1208
+ ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1209
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1210
+ ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1211
+
1212
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
1213
+ node_allocr(tensor_copy) = cur_allocr;
1214
+ SET_CAUSE(tensor_copy, "4.cpy");
1215
+ }
1216
+ node->src[j] = sched->node_copies[id][cur_backend_id];
1040
1217
  }
1041
- node->src[j] = sched->node_copies[id][cur_backend_id];
1042
1218
  }
1043
1219
  }
1220
+ sched->splits[cur_split].i_end = graph->n_nodes;
1221
+ sched->n_splits = cur_split + 1;
1044
1222
  }
1045
- sched->splits[cur_split].i_end = graph->n_nodes;
1046
- sched->n_splits = cur_split + 1;
1047
-
1048
- //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
1223
+ #ifdef DEBUG_PASS4
1224
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1225
+ #endif
1049
1226
 
1050
- #if 1
1227
+ #ifndef NDEBUG
1051
1228
  // sanity check: all sources should have the same backend as the node
1052
1229
  for (int i = 0; i < graph->n_nodes; i++) {
1053
1230
  struct ggml_tensor * node = graph->nodes[i];
@@ -1055,6 +1232,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1055
1232
  if (node_allocr == NULL) {
1056
1233
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1057
1234
  }
1235
+ if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
1236
+ fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1237
+ node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1238
+ node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
1239
+ }
1058
1240
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1059
1241
  struct ggml_tensor * src = node->src[j];
1060
1242
  if (src == NULL) {
@@ -1066,8 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1066
1248
  node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1067
1249
  j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
1068
1250
  }
1251
+ if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
1252
+ fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1253
+ src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
1254
+ src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
1255
+ }
1069
1256
  }
1070
1257
  }
1258
+ fflush(stderr);
1071
1259
  #endif
1072
1260
 
1073
1261
  // create copies of the graph for each split
@@ -1081,6 +1269,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1081
1269
  for (int j = 0; j < split->n_inputs; j++) {
1082
1270
  struct ggml_tensor * input = split->inputs[j];
1083
1271
  struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
1272
+ // add a dependency to the input source so that it is not freed before the copy is done
1273
+ GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
1084
1274
  input_cpy->src[0] = input;
1085
1275
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1086
1276
  }
@@ -1115,24 +1305,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1115
1305
  uint64_t copy_start_us = ggml_time_us();
1116
1306
  for (int j = 0; j < split->n_inputs; j++) {
1117
1307
  struct ggml_tensor * input = split->inputs[j];
1118
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
1119
- if (input->buffer == NULL) {
1120
- if (input->view_src == NULL) {
1121
- fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
1122
- exit(1);
1123
- }
1124
- // FIXME: may need to use the sched buffer instead
1125
- ggml_backend_view_init(input->view_src->buffer, input);
1126
- }
1127
- if (input_cpy->buffer == NULL) {
1128
- fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
1129
- exit(1);
1130
- }
1131
- //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
1132
- //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
1133
- ggml_backend_tensor_copy(input, input_cpy);
1308
+ struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
1309
+
1310
+ GGML_ASSERT(input->buffer != NULL);
1311
+ GGML_ASSERT(input_cpy->buffer != NULL);
1312
+
1313
+ // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
1314
+ // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
1315
+ ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
1134
1316
  }
1135
- // ggml_backend_synchronize(split_backend);
1317
+ //ggml_backend_synchronize(split_backend); // necessary to measure copy time
1136
1318
  int64_t copy_end_us = ggml_time_us();
1137
1319
  copy_us[split_backend_id] += copy_end_us - copy_start_us;
1138
1320
 
@@ -1144,7 +1326,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1144
1326
 
1145
1327
  uint64_t compute_start_us = ggml_time_us();
1146
1328
  ggml_backend_graph_compute(split_backend, &split->graph);
1147
- // ggml_backend_synchronize(split_backend);
1329
+ //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1148
1330
  uint64_t compute_end_us = ggml_time_us();
1149
1331
  compute_us[split_backend_id] += compute_end_us - compute_start_us;
1150
1332
  }
@@ -1164,26 +1346,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
1164
1346
  for (int i = 0; i < sched->n_backends; i++) {
1165
1347
  ggml_tallocr_reset(sched->tallocs[i]);
1166
1348
  }
1349
+ // reset state for the next run
1350
+ size_t hash_size = sched->hash_set.size;
1351
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
1352
+ memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
1353
+ memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
1354
+
1355
+ sched->is_reset = true;
1167
1356
  }
1168
1357
 
1169
- ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
1358
+ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
1359
+ GGML_ASSERT(n_backends > 0);
1170
1360
  GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
1171
1361
 
1172
- struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
1173
- memset(sched, 0, sizeof(struct ggml_backend_sched));
1362
+ struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1363
+
1364
+ // initialize hash table
1365
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1366
+ sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
1367
+ sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
1174
1368
 
1175
1369
  sched->n_backends = n_backends;
1176
1370
  for (int i = 0; i < n_backends; i++) {
1177
1371
  sched->backends[i] = backends[i];
1372
+ sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
1178
1373
  }
1179
1374
 
1180
1375
  sched->galloc = ggml_gallocr_new();
1181
1376
 
1182
1377
  // init measure allocs for each backend
1183
1378
  for (int i = 0; i < n_backends; i++) {
1184
- sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
1379
+ sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
1185
1380
  }
1186
1381
 
1382
+ sched_reset(sched);
1383
+
1187
1384
  return sched;
1188
1385
  }
1189
1386
 
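ggml_backend_sched_new now takes an optional array of buffer types and an explicit graph size, which is used to size the internal hash tables up front (padded by GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS). A minimal sketch of the new call for a CPU-only setup; passing NULL for bufts falls back to each backend's default buffer type, and GGML_DEFAULT_GRAPH_SIZE comes from ggml.h (the wrapper itself is illustrative, not part of ggml):

    #include "ggml-backend.h"

    static ggml_backend_sched_t new_cpu_sched(void) {
        ggml_backend_t backends[1] = { ggml_backend_cpu_init() };
        // NULL buffer types select ggml_backend_get_default_buffer_type() per backend
        return ggml_backend_sched_new(backends, /*bufts =*/ NULL, 1, GGML_DEFAULT_GRAPH_SIZE);
    }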
@@ -1195,6 +1392,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1195
1392
  ggml_tallocr_free(sched->tallocs[i]);
1196
1393
  }
1197
1394
  ggml_gallocr_free(sched->galloc);
1395
+ ggml_free(sched->ctx);
1198
1396
  free(sched->hash_set.keys);
1199
1397
  free(sched->node_talloc);
1200
1398
  free(sched->node_copies);
@@ -1202,12 +1400,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1202
1400
  }
1203
1401
 
1204
1402
  void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1205
- // initialize hash tables
1206
- size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
1207
- sched->hash_set.size = hash_size;
1208
- sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
1209
- sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
1210
- sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
1403
+ GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
1211
1404
 
1212
1405
  sched_split_graph(sched, measure_graph);
1213
1406
  sched_alloc_splits(sched);
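With the hash tables now allocated in ggml_backend_sched_new, ggml_backend_sched_init_measure only splits the measure graph and replaces the measure allocators with sized ones, and the new assertion makes a second call an error. A sketch under the assumption that measure_graph is the largest graph the application will submit (helper name is hypothetical):

    #include "ggml-backend.h"

    // one-time measurement pass, must run before the first compute
    static void sched_measure_once(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
        // a second call trips the GGML_ASSERT above, because the measure
        // allocators have already been replaced by buffers sized from this graph
        ggml_backend_sched_init_measure(sched, measure_graph);
    }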
@@ -1216,28 +1409,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
1216
1409
  for (int i = 0; i < sched->n_backends; i++) {
1217
1410
  size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
1218
1411
  ggml_tallocr_free(sched->tallocs[i]);
1219
- sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
1412
+ sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
1220
1413
  }
1221
1414
 
1222
1415
  sched_reset(sched);
1223
1416
  }
1224
1417
 
1225
1418
  void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1226
- GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1419
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1420
+
1421
+ if (!sched->is_reset) {
1422
+ sched_reset(sched);
1423
+ }
1227
1424
 
1228
1425
  sched_split_graph(sched, graph);
1229
1426
  sched_alloc_splits(sched);
1230
1427
  sched_compute_splits(sched);
1428
+ }
1429
+
1430
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1231
1431
  sched_reset(sched);
1232
1432
  }
1233
1433
 
1434
+ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1435
+ return sched->n_splits;
1436
+ }
1437
+
1234
1438
  ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
1235
1439
  int backend_index = sched_backend_prio(sched, backend);
1440
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1236
1441
  return sched->tallocs[backend_index];
1237
1442
  }
1238
1443
 
1239
1444
  ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
1240
1445
  int backend_index = sched_backend_prio(sched, backend);
1446
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1241
1447
  return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
1242
1448
  }
1243
1449
 
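In this hunk ggml_backend_sched_graph_compute now resets the scheduler lazily when needed, and two accessors are added: ggml_backend_sched_reset for an explicit reset between graphs and ggml_backend_sched_get_n_splits to inspect how many splits the last graph produced. A per-iteration sketch, where build_graph is an application-provided builder (hypothetical, not part of ggml):

    #include <stdio.h>
    #include "ggml-backend.h"

    extern struct ggml_cgraph * build_graph(int iter); // application code, hypothetical

    static void run(ggml_backend_sched_t sched, int n_iter) {
        for (int i = 0; i < n_iter; i++) {
            struct ggml_cgraph * gf = build_graph(i);
            ggml_backend_sched_graph_compute(sched, gf);
            fprintf(stderr, "iter %d: %d splits\n", i, ggml_backend_sched_get_n_splits(sched));
            ggml_backend_sched_reset(sched); // drop node assignments before building the next graph
        }
    }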
@@ -1247,10 +1453,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
1247
1453
  node_allocr(node) = sched->tallocs[backend_index];
1248
1454
  }
1249
1455
 
1456
+ ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1457
+ ggml_tallocr_t allocr = node_allocr(node);
1458
+ if (allocr == NULL) {
1459
+ return NULL;
1460
+ }
1461
+ return get_allocr_backend(sched, allocr);
1462
+ }
1463
+
1250
1464
  // utils
1465
+
1251
1466
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1252
1467
  GGML_ASSERT(tensor->buffer == NULL);
1253
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
1468
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
1254
1469
  GGML_ASSERT(tensor->view_src != NULL);
1255
1470
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1256
1471
  GGML_ASSERT(tensor->view_src->data != NULL);
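ggml_backend_sched_get_node_backend, added earlier in this hunk, is the read-side counterpart of ggml_backend_sched_set_node_backend and returns NULL while a node is still unassigned. A small illustrative helper, assuming cpu is one of the backends the scheduler was created with:

    #include "ggml-backend.h"

    // pin a node to a specific backend and read the assignment back
    static void pin_node(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t cpu) {
        ggml_backend_sched_set_node_backend(sched, node, cpu);
        GGML_ASSERT(ggml_backend_sched_get_node_backend(sched, node) == cpu);
    }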
@@ -1316,6 +1531,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
1316
1531
 
1317
1532
  struct ggml_tensor * dst = node_copies[id];
1318
1533
  if (dst->view_src != NULL) {
1534
+ graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
1319
1535
  ggml_backend_view_init(dst->view_src->buffer, dst);
1320
1536
  }
1321
1537
  else {
@@ -1349,6 +1565,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1349
1565
  struct ggml_context * ctx_allocated = ggml_init(params);
1350
1566
  struct ggml_context * ctx_unallocated = ggml_init(params);
1351
1567
 
1568
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1569
+ fprintf(stderr, "failed to allocate context for graph copy\n");
1570
+ free(hash_set.keys);
1571
+ free(node_copies);
1572
+ free(node_init);
1573
+ ggml_free(ctx_allocated);
1574
+ ggml_free(ctx_unallocated);
1575
+ return (struct ggml_backend_graph_copy) {
1576
+ /* .buffer = */ NULL,
1577
+ /* .ctx_allocated = */ NULL,
1578
+ /* .ctx_unallocated = */ NULL,
1579
+ /* .graph = */ NULL,
1580
+ };
1581
+ }
1582
+
1352
1583
  // dup nodes
1353
1584
  for (int i = 0; i < graph->n_nodes; i++) {
1354
1585
  struct ggml_tensor * node = graph->nodes[i];
@@ -1357,6 +1588,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1357
1588
 
1358
1589
  // allocate nodes
1359
1590
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1591
+ if (buffer == NULL) {
1592
+ fprintf(stderr, "failed to allocate buffer for graph copy\n");
1593
+ free(hash_set.keys);
1594
+ free(node_copies);
1595
+ free(node_init);
1596
+ ggml_free(ctx_allocated);
1597
+ ggml_free(ctx_unallocated);
1598
+ return (struct ggml_backend_graph_copy) {
1599
+ /* .buffer = */ NULL,
1600
+ /* .ctx_allocated = */ NULL,
1601
+ /* .ctx_unallocated = */ NULL,
1602
+ /* .graph = */ NULL,
1603
+ };
1604
+ }
1360
1605
 
1361
1606
  //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1362
1607
 
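These two hunks make ggml_backend_graph_copy fail gracefully when either the contexts or the backing buffer cannot be allocated, returning a struct whose fields are all NULL instead of aborting. Callers can check the buffer field, as in this hedged sketch (backend and graph are assumed to exist; the helper is illustrative):

    #include <stdio.h>
    #include "ggml-backend.h"

    static bool copy_and_free(ggml_backend_t backend, struct ggml_cgraph * graph) {
        struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend, graph);
        if (copy.buffer == NULL) {
            fprintf(stderr, "graph copy failed (allocation error)\n");
            return false;
        }
        // ... evaluate or inspect copy.graph on `backend` here ...
        ggml_backend_graph_copy_free(copy);
        return true;
    }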
@@ -1393,8 +1638,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
1393
1638
  ggml_free(copy.ctx_unallocated);
1394
1639
  }
1395
1640
 
1396
- void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1641
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1397
1642
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
1643
+ if (copy.buffer == NULL) {
1644
+ return false;
1645
+ }
1646
+
1398
1647
  struct ggml_cgraph * g1 = graph;
1399
1648
  struct ggml_cgraph * g2 = copy.graph;
1400
1649
 
@@ -1424,4 +1673,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
1424
1673
  }
1425
1674
 
1426
1675
  ggml_backend_graph_copy_free(copy);
1676
+
1677
+ return true;
1427
1678
  }
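ggml_backend_compare_graph_backend now reports failure instead of crashing when the graph copy cannot be allocated, so callers should check its return value. A sketch, assuming a callback with the ggml_backend_eval_callback shape (node index, the two tensors to compare, user data), where returning false is treated as a request to stop comparing:

    #include <stdio.h>
    #include "ggml-backend.h"

    // illustrative callback: prints the node names and keeps going
    static bool print_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) user_data;
        fprintf(stderr, "node %d: %s vs %s\n", node_index, t1->name, t2->name);
        return true; // continue with the next node
    }

    // assumed to be called with two initialized backends and a built graph
    static void compare(ggml_backend_t b1, ggml_backend_t b2, struct ggml_cgraph * graph) {
        if (!ggml_backend_compare_graph_backend(b1, b2, graph, print_cb, NULL)) {
            fprintf(stderr, "comparison skipped: graph copy failed\n");
        }
    }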