llama_cpp 0.12.1 → 0.12.2 (diff of the bundled ggml-backend.c)

@@ -15,7 +15,11 @@
15
15
 
16
16
  // backend buffer type
17
17
 
18
- ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
18
+ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
19
+ return buft->iface.get_name(buft);
20
+ }
21
+
22
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
19
23
  return buft->iface.alloc_buffer(buft, size);
20
24
  }
21
25
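A minimal usage sketch for the new buffer-type name getter, assuming the usual ggml-backend.h header; the alloc_named helper and its printf reporting are illustrative, not part of this release:

#include <stdio.h>
#include "ggml-backend.h"

// hypothetical helper: report a buffer type's name before allocating from it
static ggml_backend_buffer_t alloc_named(ggml_backend_buffer_type_t buft, size_t size) {
    printf("allocating %zu bytes from %s\n", size, ggml_backend_buft_name(buft));
    return ggml_backend_buft_alloc_buffer(buft, size);
}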
 
@@ -23,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
23
27
  return buft->iface.get_alignment(buft);
24
28
  }
25
29
 
26
- size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
30
+ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
27
31
  // get_alloc_size is optional, defaults to ggml_nbytes
28
32
  if (buft->iface.get_alloc_size) {
29
33
  return buft->iface.get_alloc_size(buft, tensor);
@@ -44,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
44
48
 
45
49
  // backend buffer
46
50
 
47
- ggml_backend_buffer_t ggml_backend_buffer_init(
51
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
48
52
  ggml_backend_buffer_type_t buft,
49
53
  struct ggml_backend_buffer_i iface,
50
54
  ggml_backend_buffer_context_t context,
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
58
62
  /* .buft = */ buft,
59
63
  /* .context = */ context,
60
64
  /* .size = */ size,
65
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
61
66
  };
62
67
 
63
68
  return buffer;
64
69
  }
65
70
 
71
+ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
72
+ return buffer->iface.get_name(buffer);
73
+ }
74
+
66
75
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
67
76
  if (buffer == NULL) {
68
77
  return;
@@ -86,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
86
95
  return base;
87
96
  }
88
97
 
89
- void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
98
+ GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
90
99
  // init_tensor is optional
91
100
  if (buffer->iface.init_tensor) {
92
101
  buffer->iface.init_tensor(buffer, tensor);
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
94
103
  }
95
104
 
96
105
  size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
97
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
106
+ return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
98
107
  }
99
108
 
100
109
  size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
101
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
110
+ return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
102
111
  }
103
112
 
104
113
  void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
106
115
  }
107
116
 
108
117
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
109
- return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
118
+ return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
110
119
  }
111
120
 
112
- ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
121
+ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
122
+ buffer->usage = usage;
123
+ }
124
+
125
+ ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
113
126
  return buffer->buft;
114
127
  }
115
128
 
129
+ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
130
+ if (buffer->iface.reset) {
131
+ buffer->iface.reset(buffer);
132
+ }
133
+ }
134
+
135
+ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
136
+ ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
137
+ if (dst_buf->iface.cpy_tensor) {
138
+ return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
139
+ }
140
+ return false;
141
+ }
142
+
116
143
  // backend
117
144
 
118
145
  const char * ggml_backend_name(ggml_backend_t backend) {
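A short sketch of the new buffer-level helpers, assuming the usual ggml-backend.h header; the alloc_weights helper is illustrative. Tagging a buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS lets the scheduler later pin operations that read it to the weights' backend, and ggml_backend_buffer_get_type replaces the old ggml_backend_buffer_type accessor:

#include "ggml-backend.h"

// hypothetical: allocate a buffer for model weights and tag its usage
static ggml_backend_buffer_t alloc_weights(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    // the buffer type is still queryable, now through the renamed getter
    GGML_ASSERT(ggml_backend_buffer_get_type(buf) == buft);
    return buf;
}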
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
146
173
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
147
174
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
148
175
 
149
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
176
+ if (backend->iface.set_tensor_async == NULL) {
177
+ ggml_backend_tensor_set(tensor, data, offset, size);
178
+ } else {
179
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
180
+ }
150
181
  }
151
182
 
152
183
  void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
153
184
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
154
185
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
155
186
 
156
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
187
+ if (backend->iface.get_tensor_async == NULL) {
188
+ ggml_backend_tensor_get(tensor, data, offset, size);
189
+ } else {
190
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
191
+ }
157
192
  }
158
193
 
159
- void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
194
+ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
195
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
196
+
160
197
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
161
- GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
198
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
162
199
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
163
200
 
164
- tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
201
+ tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
165
202
  }
166
203
 
167
- void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
204
+ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
205
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
206
+
168
207
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
169
208
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
170
209
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
171
210
 
172
- tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
211
+ tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
173
212
  }
174
213
 
175
214
  void ggml_backend_synchronize(ggml_backend_t backend) {
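With the NULL checks above, the async setter and getter can be called on any backend; backends without an async interface fall back to the blocking ggml_backend_tensor_set / ggml_backend_tensor_get. A hedged sketch of a caller (the upload helper is illustrative, not from llama.cpp):

#include "ggml-backend.h"

// hypothetical: upload host data through the async path when available,
// otherwise transparently use the blocking setter
static void upload(ggml_backend_t backend, struct ggml_tensor * t, const void * host_data) {
    ggml_backend_tensor_set_async(backend, t, host_data, 0, ggml_nbytes(t));
    ggml_backend_synchronize(backend); // wait for completion on async backends
}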
@@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
190
229
 
191
230
  void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
192
231
  backend->iface.graph_plan_compute(backend, plan);
193
-
194
- // TODO: optional sync
195
- ggml_backend_synchronize(backend);
196
232
  }
197
233
 
198
234
  bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
199
- if (!backend->iface.graph_compute(backend, cgraph)) {
200
- return false;
201
- }
202
-
203
- // TODO: optional sync
204
- ggml_backend_synchronize(backend);
205
- return true;
235
+ return backend->iface.graph_compute(backend, cgraph);
206
236
  }
207
237
 
208
238
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
227
257
  }
228
258
 
229
259
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
230
- //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
231
- //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
232
260
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
233
261
 
234
- // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
235
-
236
262
  if (src == dst) {
237
263
  return;
238
264
  }
239
265
 
240
- // TODO: allow backends to support copy to/from same backend
241
-
242
- if (dst->buffer->iface.cpy_tensor_from != NULL) {
243
- dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
244
- } else if (src->buffer->iface.cpy_tensor_to != NULL) {
245
- src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
246
- } else {
247
- // shouldn't be hit when copying from/to CPU
248
- #ifndef NDEBUG
249
- fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
250
- "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
251
- #endif
266
+ if (ggml_backend_buffer_is_host(src->buffer)) {
267
+ ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
268
+ } else if (ggml_backend_buffer_is_host(dst->buffer)) {
269
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
270
+ } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
271
+ #ifndef NDEBUG
272
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
273
+ #endif
252
274
  size_t nbytes = ggml_nbytes(src);
253
275
  void * data = malloc(nbytes);
254
276
  ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
257
279
  }
258
280
  }
259
281
 
282
+ void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
283
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
284
+
285
+ if (src == dst) {
286
+ return;
287
+ }
288
+
289
+ if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
290
+ if (backend->iface.cpy_tensor_async != NULL) {
291
+ if (backend->iface.cpy_tensor_async(backend, src, dst)) {
292
+ return;
293
+ }
294
+ }
295
+ }
296
+
297
+ size_t nbytes = ggml_nbytes(src);
298
+ if (ggml_backend_buffer_is_host(src->buffer)) {
299
+ ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
300
+ }
301
+ else {
302
+ ggml_backend_tensor_copy(src, dst);
303
+ }
304
+ }
305
+
306
+
260
307
  // backend registry
261
308
 
262
309
  #define GGML_MAX_BACKENDS_REG 16
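A sketch of the reworked copy paths, assuming the usual ggml-backend.h header; move_tensor is illustrative. Copies now try the host-buffer fast paths first, then the buffer's cpy_tensor hook, and only then the get/set fallback; the async variant additionally requires both buffers to be supported by the given backend:

#include "ggml-backend.h"

// hypothetical: copy between two allocated tensors with the same layout
static void move_tensor(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    // queues the copy on the backend when possible, otherwise falls back
    // to the synchronous ggml_backend_tensor_copy internally
    ggml_backend_tensor_copy_async(backend, src, dst);
    ggml_backend_synchronize(backend);
}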
@@ -271,9 +318,9 @@ struct ggml_backend_reg {
271
318
  static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
272
319
  static size_t ggml_backend_registry_count = 0;
273
320
 
274
- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
321
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
275
322
 
276
- static void ggml_backend_registry_init(void) {
323
+ GGML_CALL static void ggml_backend_registry_init(void) {
277
324
  static bool initialized = false;
278
325
 
279
326
  if (initialized) {
@@ -286,18 +333,18 @@ static void ggml_backend_registry_init(void) {
286
333
 
287
334
  // add forward decls here to avoid including the backend headers
288
335
  #ifdef GGML_USE_CUBLAS
289
- extern void ggml_backend_cuda_reg_devices(void);
336
+ extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
290
337
  ggml_backend_cuda_reg_devices();
291
338
  #endif
292
339
 
293
340
  #ifdef GGML_USE_METAL
294
- extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
295
- extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
341
+ extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
342
+ extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
296
343
  ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
297
344
  #endif
298
345
  }
299
346
 
300
- void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
347
+ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
301
348
  GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
302
349
 
303
350
  size_t id = ggml_backend_registry_count;
@@ -392,68 +439,80 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
392
439
 
393
440
  // backend CPU
394
441
 
395
- static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
442
+ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
443
+ return "CPU";
444
+
445
+ GGML_UNUSED(buffer);
446
+ }
447
+
448
+ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
396
449
  return (void *)buffer->context;
397
450
  }
398
451
 
399
- static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
452
+ GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
400
453
  free(buffer->context);
401
454
  }
402
455
 
403
- static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
456
+ GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
404
457
  memcpy((char *)tensor->data + offset, data, size);
405
458
 
406
459
  GGML_UNUSED(buffer);
407
460
  }
408
461
 
409
- static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
462
+ GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
410
463
  memcpy(data, (const char *)tensor->data + offset, size);
411
464
 
412
465
  GGML_UNUSED(buffer);
413
466
  }
414
467
 
415
- static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
416
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
417
-
418
- GGML_UNUSED(buffer);
419
- }
420
-
421
- static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
422
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
468
+ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
469
+ if (ggml_backend_buffer_is_host(src->buffer)) {
470
+ memcpy(dst->data, src->data, ggml_nbytes(src));
471
+ return true;
472
+ }
473
+ return false;
423
474
 
424
475
  GGML_UNUSED(buffer);
425
476
  }
426
477
 
427
- static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
478
+ GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
428
479
  memset(buffer->context, value, buffer->size);
429
480
  }
430
481
 
431
482
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
483
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
432
484
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
433
485
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
434
486
  /* .init_tensor = */ NULL, // no initialization required
435
487
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
436
488
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
437
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
438
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
489
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
439
490
  /* .clear = */ ggml_backend_cpu_buffer_clear,
491
+ /* .reset = */ NULL,
440
492
  };
441
493
 
442
494
  // for buffers from ptr, free is not called
443
495
  static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
496
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
444
497
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
445
498
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
446
499
  /* .init_tensor = */ NULL, // no initialization required
447
500
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
448
501
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
449
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
450
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
502
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
451
503
  /* .clear = */ ggml_backend_cpu_buffer_clear,
504
+ /* .reset = */ NULL,
452
505
  };
453
506
 
454
507
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
455
508
 
456
- static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
509
+ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
510
+ return "CPU";
511
+
512
+ GGML_UNUSED(buft);
513
+ }
514
+
515
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
457
516
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
458
517
  void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
459
518
 
@@ -462,27 +521,28 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
462
521
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
463
522
  }
464
523
 
465
- static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
524
+ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
466
525
  return TENSOR_ALIGNMENT;
467
526
 
468
527
  GGML_UNUSED(buft);
469
528
  }
470
529
 
471
- static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
530
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
472
531
  return ggml_backend_is_cpu(backend);
473
532
 
474
533
  GGML_UNUSED(buft);
475
534
  }
476
535
 
477
- static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
536
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
478
537
  return true;
479
538
 
480
539
  GGML_UNUSED(buft);
481
540
  }
482
541
 
483
- ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
542
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
484
543
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
485
544
  /* .iface = */ {
545
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
486
546
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
487
547
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
488
548
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -501,11 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
501
561
 
502
562
  #include <hbwmalloc.h>
503
563
 
504
- static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
564
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
565
+ return "CPU_HBM";
566
+
567
+ GGML_UNUSED(buft);
568
+ }
569
+
570
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
571
+ return "CPU_HBM";
572
+
573
+ GGML_UNUSED(buf);
574
+ }
575
+
576
+ GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
505
577
  hbw_free(buffer->context);
506
578
  }
507
579
 
508
- static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
580
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
509
581
  //void * ptr = hbw_malloc(size);
510
582
  void * ptr;
511
583
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
514
586
  return NULL;
515
587
  }
516
588
 
517
- // FIXME: this is a hack to avoid having to implement a new buffer type
518
589
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
519
590
  buffer->buft = buft;
591
+ buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
520
592
  buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
521
593
 
522
594
  return buffer;
523
595
  }
524
596
 
525
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
597
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
526
598
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
527
599
  /* .iface = */ {
600
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
528
601
  /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
529
602
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
530
603
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -544,20 +617,20 @@ struct ggml_backend_cpu_context {
544
617
  size_t work_size;
545
618
  };
546
619
 
547
- static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
620
+ GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
548
621
  return "CPU";
549
622
 
550
623
  GGML_UNUSED(backend);
551
624
  }
552
625
 
553
- static void ggml_backend_cpu_free(ggml_backend_t backend) {
626
+ GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
554
627
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
555
628
  free(cpu_ctx->work_data);
556
629
  free(cpu_ctx);
557
630
  free(backend);
558
631
  }
559
632
 
560
- static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
633
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
561
634
  return ggml_backend_cpu_buffer_type();
562
635
 
563
636
  GGML_UNUSED(backend);
@@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu {
568
641
  struct ggml_cgraph cgraph;
569
642
  };
570
643
 
571
- static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
644
+ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
572
645
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
573
646
 
574
647
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -583,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
583
656
  return cpu_plan;
584
657
  }
585
658
 
586
- static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
659
+ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
587
660
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
588
661
 
589
662
  free(cpu_plan->cplan.work_data);
@@ -592,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
592
665
  GGML_UNUSED(backend);
593
666
  }
594
667
 
595
- static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
668
+ GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
596
669
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
597
670
 
598
671
  ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -600,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
600
673
  GGML_UNUSED(backend);
601
674
  }
602
675
 
603
- static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
676
+ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
604
677
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
605
678
 
606
679
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -617,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
617
690
  return true;
618
691
  }
619
692
 
620
- static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
693
+ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
621
694
  switch (op->op) {
622
695
  case GGML_OP_MUL_MAT:
623
696
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
634
707
  /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
635
708
  /* .set_tensor_async = */ NULL,
636
709
  /* .get_tensor_async = */ NULL,
637
- /* .cpy_tensor_from_async = */ NULL,
638
- /* .cpy_tensor_to_async = */ NULL,
710
+ /* .cpy_tensor_async = */ NULL,
639
711
  /* .synchronize = */ NULL,
640
712
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
641
713
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
@@ -660,8 +732,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
660
732
  return cpu_backend;
661
733
  }
662
734
 
663
- bool ggml_backend_is_cpu(ggml_backend_t backend) {
664
- return backend->iface.get_name == ggml_backend_cpu_name;
735
+ GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
736
+ return backend && backend->iface.get_name == ggml_backend_cpu_name;
665
737
  }
666
738
 
667
739
  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
@@ -671,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
671
743
  ctx->n_threads = n_threads;
672
744
  }
673
745
 
674
- ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
746
+ GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
675
747
  return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
676
748
  }
677
749
 
678
- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
750
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
679
751
  return ggml_backend_cpu_init();
680
752
 
681
753
  GGML_UNUSED(params);
@@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
685
757
 
686
758
  // scheduler
687
759
 
688
- #define GGML_MAX_BACKENDS 4
760
+ #define GGML_MAX_BACKENDS 16
689
761
  #define GGML_MAX_SPLITS 256
690
762
  #define GGML_MAX_SPLIT_INPUTS 16
691
763
 
@@ -695,21 +767,29 @@ struct ggml_backend_sched_split {
695
767
  int i_end;
696
768
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
697
769
  int n_inputs;
770
+ // graph view of this split
698
771
  struct ggml_cgraph graph;
699
772
  };
700
773
 
701
774
  struct ggml_backend_sched {
775
+ bool is_reset; // true if the scheduler has been reset since the last graph split
776
+
702
777
  int n_backends;
703
778
  ggml_backend_t backends[GGML_MAX_BACKENDS];
779
+ ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
704
780
  ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
705
781
 
706
782
  ggml_gallocr_t galloc;
707
783
 
784
+ // hash keys of the nodes in the graph
708
785
  struct ggml_hash_set hash_set;
709
- ggml_tallocr_t * node_talloc; // [hash_set.size]
710
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
786
+ // hash values (arrays of [hash_set.size])
787
+ ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
788
+ struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
711
789
 
790
+ // copy of the graph with modified inputs
712
791
  struct ggml_cgraph * graph;
792
+
713
793
  struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
714
794
  int n_splits;
715
795
 
@@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
750
830
  return INT_MAX;
751
831
  }
752
832
 
753
- static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
833
+ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
754
834
  if (buffer == NULL) {
755
835
  return NULL;
756
836
  }
837
+
838
+ // check if this is already allocate in a allocr buffer (from user manual allocations)
839
+ for (int i = 0; i < sched->n_backends; i++) {
840
+ if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
841
+ return sched->tallocs[i];
842
+ }
843
+ }
844
+
757
845
  // find highest prio backend that supports the buffer type
758
846
  for (int i = 0; i < sched->n_backends; i++) {
759
847
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
760
- return sched->backends[i];
848
+ return sched->tallocs[i];
761
849
  }
762
850
  }
763
851
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
@@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
767
855
  if (allocr == NULL) {
768
856
  return NULL;
769
857
  }
770
- // find highest prio backend that supports the buffer type
771
858
  for (int i = 0; i < sched->n_backends; i++) {
772
859
  if (sched->tallocs[i] == allocr) {
773
860
  return sched->backends[i];
@@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
777
864
  }
778
865
 
779
866
  #if 0
780
- static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
867
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
781
868
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
782
869
  #define GET_CAUSE(node) causes[hash_id(node)]
783
870
  #else
@@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
786
873
  #endif
787
874
 
788
875
  // returns the backend that should be used for the node based on the current locations
789
- static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
790
- // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
791
- // ie. kv cache updates
792
- // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
876
+ static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
877
+ // assign pre-allocated nodes to their backend
793
878
  // dst
794
- ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
795
- if (cur_backend != NULL) {
879
+ ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
880
+ if (cur_allocr != NULL) {
796
881
  SET_CAUSE(node, "1.dst");
797
- return cur_backend;
882
+ return cur_allocr;
798
883
  }
799
-
800
884
  // view_src
801
- if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
802
- SET_CAUSE(node, "1.vsrc");
803
- return get_buffer_backend(sched, node->view_src->buffer);
885
+ if (node->view_src != NULL) {
886
+ cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
887
+ if (cur_allocr != NULL) {
888
+ SET_CAUSE(node, "1.vsrc");
889
+ return cur_allocr;
890
+ }
804
891
  }
805
-
806
- // src
807
- int cur_prio = INT_MAX;
808
- size_t cur_size = 0;
809
-
892
+ // assign nodes that use weights to the backend of the weights
810
893
  for (int i = 0; i < GGML_MAX_SRC; i++) {
811
894
  const struct ggml_tensor * src = node->src[i];
812
895
  if (src == NULL) {
813
896
  break;
814
897
  }
815
- ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
816
- if (src_backend != NULL) {
817
- int src_prio = sched_backend_prio(sched, src_backend);
818
- size_t src_size = ggml_nbytes(src);
819
- if (src_prio < cur_prio && src_size >= cur_size) {
820
- cur_prio = src_prio;
821
- cur_size = src_size;
822
- cur_backend = src_backend;
823
- SET_CAUSE(node, "1.src%d", i);
824
- }
898
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
899
+ ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
900
+ // operations with weights are always run on the same backend as the weights
901
+ SET_CAUSE(node, "1.wgt%d", i);
902
+ return src_allocr;
825
903
  }
826
904
  }
827
- return cur_backend;
905
+
906
+ return NULL;
828
907
  }
829
908
 
830
909
  static char * fmt_size(size_t size) {
@@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
857
936
  }
858
937
  ggml_tallocr_t node_allocr = node_allocr(node);
859
938
  ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
860
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
939
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
861
940
  fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
862
941
  for (int j = 0; j < GGML_MAX_SRC; j++) {
863
942
  struct ggml_tensor * src = node->src[j];
@@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
866
945
  }
867
946
  ggml_tallocr_t src_allocr = node_allocr(src);
868
947
  ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
869
- fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
948
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
870
949
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
871
950
  }
872
951
  fprintf(stderr, "\n");
@@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
882
961
  return dup;
883
962
  }
884
963
 
964
+
965
+ //#define DEBUG_PASS1
966
+ //#define DEBUG_PASS2
967
+ //#define DEBUG_PASS3
968
+ //#define DEBUG_PASS4
969
+
885
970
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
886
- // TODO: merge passes
887
971
  static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
888
- // reset state
889
- size_t hash_size = sched->hash_set.size;
890
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
891
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
892
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
972
+ // reset splits
893
973
  sched->n_splits = 0;
974
+ sched->is_reset = false;
894
975
 
895
976
  struct ggml_init_params params = {
896
977
  /* .mem_size = */ sizeof(sched->context_buffer),
@@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
898
979
  /* .no_alloc = */ true
899
980
  };
900
981
 
901
- if (sched->ctx != NULL) {
902
- ggml_free(sched->ctx);
903
- }
982
+ ggml_free(sched->ctx);
904
983
 
905
984
  sched->ctx = ggml_init(params);
985
+ if (sched->ctx == NULL) {
986
+ fprintf(stderr, "%s: failed to initialize context\n", __func__);
987
+ GGML_ASSERT(false);
988
+ }
906
989
 
907
- // pass 1: assign backends to ops with allocated inputs
990
+ // pass 1: assign backends to ops with pre-allocated inputs
908
991
  for (int i = 0; i < graph->n_leafs; i++) {
909
992
  struct ggml_tensor * leaf = graph->leafs[i];
910
993
  if (node_allocr(leaf) != NULL) {
911
994
  // do not overwrite user assignments
912
995
  continue;
913
996
  }
914
- ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
915
- if (leaf_backend == NULL && leaf->view_src != NULL) {
916
- leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
917
- }
918
- if (leaf_backend != NULL) {
919
- node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
920
- }
997
+ node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
921
998
  }
922
999
 
923
1000
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -926,50 +1003,120 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
926
1003
  // do not overwrite user assignments
927
1004
  continue;
928
1005
  }
929
- ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
930
- if (node_backend != NULL) {
931
- node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
1006
+ node_allocr(node) = sched_allocr_from_cur(sched, node);
1007
+ // src
1008
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1009
+ struct ggml_tensor * src = node->src[j];
1010
+ if (src == NULL) {
1011
+ break;
1012
+ }
1013
+ if (node_allocr(src) == NULL) {
1014
+ node_allocr(src) = sched_allocr_from_cur(sched, src);
1015
+ }
932
1016
  }
933
1017
  }
934
- //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1018
+ #ifdef DEBUG_PASS1
1019
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1020
+ #endif
935
1021
 
936
- // pass 2: assign backends to ops from current assignments
937
- // TODO:
938
- // - reuse sched_backend_from_cur
939
- for (int i = 0; i < graph->n_nodes; i++) {
940
- struct ggml_tensor * node = graph->nodes[i];
941
- ggml_tallocr_t node_allocr = node_allocr(node);
942
- if (node_allocr == NULL) {
943
- int cur_prio = INT_MAX;
944
- size_t cur_size = 0;
945
- for (int j = 0; j < GGML_MAX_SRC; j++) {
946
- struct ggml_tensor * src = node->src[j];
947
- if (src == NULL) {
948
- break;
1022
+ // pass 2: expand current backend assignments
1023
+ // assign the same backend to adjacent nodes
1024
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1025
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1026
+
1027
+ // pass 2.1 expand gpu up
1028
+ {
1029
+ ggml_tallocr_t cur_allocr = NULL;
1030
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1031
+ struct ggml_tensor * node = graph->nodes[i];
1032
+ if (ggml_is_view_op(node->op)) {
1033
+ continue;
1034
+ }
1035
+ ggml_tallocr_t node_allocr = node_allocr(node);
1036
+ if (node_allocr != NULL) {
1037
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1038
+ // skip cpu (lowest prio backend)
1039
+ cur_allocr = NULL;
1040
+ } else {
1041
+ cur_allocr = node_allocr;
949
1042
  }
950
- ggml_tallocr_t src_allocr = node_allocr(src);
951
- if (src_allocr != NULL) {
952
- int src_prio = sched_allocr_prio(sched, src_allocr);
953
- size_t src_size = ggml_nbytes(src);
954
- if (src_prio < cur_prio && src_size >= cur_size) {
955
- cur_prio = src_prio;
956
- cur_size = src_size;
957
- node_allocr = src_allocr;
958
- SET_CAUSE(node, "2.src%d", j);
959
- }
1043
+ } else {
1044
+ node_allocr(node) = cur_allocr;
1045
+ SET_CAUSE(node, "2.1");
1046
+ }
1047
+ }
1048
+ }
1049
+
1050
+ // pass 2.2 expand gpu down
1051
+ {
1052
+ ggml_tallocr_t cur_allocr = NULL;
1053
+ for (int i = 0; i < graph->n_nodes; i++) {
1054
+ struct ggml_tensor * node = graph->nodes[i];
1055
+ if (ggml_is_view_op(node->op)) {
1056
+ continue;
1057
+ }
1058
+ ggml_tallocr_t node_allocr = node_allocr(node);
1059
+ if (node_allocr != NULL) {
1060
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1061
+ // skip cpu (lowest prio backend)
1062
+ cur_allocr = NULL;
1063
+ } else {
1064
+ cur_allocr = node_allocr;
960
1065
  }
1066
+ } else {
1067
+ node_allocr(node) = cur_allocr;
1068
+ SET_CAUSE(node, "2.2");
961
1069
  }
1070
+ }
1071
+ }
1072
+
1073
+ // pass 2.3 expand rest up
1074
+ {
1075
+ ggml_tallocr_t cur_allocr = NULL;
1076
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1077
+ struct ggml_tensor * node = graph->nodes[i];
1078
+ if (ggml_is_view_op(node->op)) {
1079
+ continue;
1080
+ }
1081
+ ggml_tallocr_t node_allocr = node_allocr(node);
962
1082
  if (node_allocr != NULL) {
963
- node_allocr(node) = node_allocr;
1083
+ cur_allocr = node_allocr;
1084
+ } else {
1085
+ node_allocr(node) = cur_allocr;
1086
+ SET_CAUSE(node, "2.3");
964
1087
  }
965
1088
  }
966
1089
  }
967
- //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
968
1090
 
969
- // pass 3: assign backends to remaining src from dst (should only be leafs)
1091
+ // pass 2.4 expand rest down
1092
+ {
1093
+ ggml_tallocr_t cur_allocr = NULL;
1094
+ for (int i = 0; i < graph->n_nodes; i++) {
1095
+ struct ggml_tensor * node = graph->nodes[i];
1096
+ if (ggml_is_view_op(node->op)) {
1097
+ continue;
1098
+ }
1099
+ ggml_tallocr_t node_allocr = node_allocr(node);
1100
+ if (node_allocr != NULL) {
1101
+ cur_allocr = node_allocr;
1102
+ } else {
1103
+ node_allocr(node) = cur_allocr;
1104
+ SET_CAUSE(node, "2.4");
1105
+ }
1106
+ }
1107
+ }
1108
+ #ifdef DEBUG_PASS2
1109
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1110
+ #endif
1111
+
1112
+ // pass 3: assign backends to remaining src from dst and view_src
970
1113
  for (int i = 0; i < graph->n_nodes; i++) {
971
1114
  struct ggml_tensor * node = graph->nodes[i];
972
- ggml_tallocr_t node_allocr = node_allocr(node);
1115
+ ggml_tallocr_t cur_allocr = node_allocr(node);
1116
+ if (node->view_src != NULL && cur_allocr == NULL) {
1117
+ cur_allocr = node_allocr(node) = node_allocr(node->view_src);
1118
+ SET_CAUSE(node, "3.vsrc");
1119
+ }
973
1120
  for (int j = 0; j < GGML_MAX_SRC; j++) {
974
1121
  struct ggml_tensor * src = node->src[j];
975
1122
  if (src == NULL) {
@@ -977,81 +1124,107 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
977
1124
  }
978
1125
  ggml_tallocr_t src_allocr = node_allocr(src);
979
1126
  if (src_allocr == NULL) {
980
- node_allocr(src) = node_allocr;
1127
+ if (src->view_src != NULL) {
1128
+ // views are always on the same backend as the source
1129
+ node_allocr(src) = node_allocr(src->view_src);
1130
+ SET_CAUSE(src, "3.vsrc");
1131
+ } else {
1132
+ node_allocr(src) = cur_allocr;
1133
+ SET_CAUSE(src, "3.cur");
1134
+ }
981
1135
  }
982
1136
  }
983
1137
  }
984
- //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1138
+ #ifdef DEBUG_PASS3
1139
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1140
+ #endif
985
1141
 
986
1142
  // pass 4: split graph, find tensors that need to be copied
987
- // TODO:
988
- // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
989
- // find first backend
990
- int cur_split = 0;
991
- for (int i = 0; i < graph->n_nodes; i++) {
992
- struct ggml_tensor * node = graph->nodes[i];
993
- if (node->view_src == NULL) {
994
- sched->splits[0].tallocr = node_allocr(node);
995
- break;
996
- }
997
- }
998
- sched->splits[0].i_start = 0;
999
- sched->splits[0].n_inputs = 0;
1000
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1001
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1002
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1003
- for (int i = 0; i < graph->n_nodes; i++) {
1004
- struct ggml_tensor * node = graph->nodes[i];
1005
-
1006
- if (ggml_is_view_op(node->op)) {
1007
- continue;
1143
+ {
1144
+ int cur_split = 0;
1145
+ // find the backend of the first split, skipping view ops
1146
+ for (int i = 0; i < graph->n_nodes; i++) {
1147
+ struct ggml_tensor * node = graph->nodes[i];
1148
+ if (!ggml_is_view_op(node->op)) {
1149
+ sched->splits[0].tallocr = node_allocr(node);
1150
+ break;
1151
+ }
1008
1152
  }
1153
+ sched->splits[0].i_start = 0;
1154
+ sched->splits[0].n_inputs = 0;
1155
+ memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1156
+ ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1157
+ size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1158
+ for (int i = 0; i < graph->n_nodes; i++) {
1159
+ struct ggml_tensor * node = graph->nodes[i];
1160
+
1161
+ if (ggml_is_view_op(node->op)) {
1162
+ continue;
1163
+ }
1009
1164
 
1010
- ggml_tallocr_t node_allocr = node_allocr(node);
1165
+ ggml_tallocr_t node_allocr = node_allocr(node);
1011
1166
 
1012
- if (node_allocr != cur_allocr) {
1013
- sched->splits[cur_split].i_end = i;
1014
- cur_split++;
1015
- GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1016
- sched->splits[cur_split].tallocr = node_allocr;
1017
- sched->splits[cur_split].i_start = i;
1018
- sched->splits[cur_split].n_inputs = 0;
1019
- memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
1020
- cur_allocr = node_allocr;
1021
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1022
- }
1167
+ GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
1023
1168
 
1024
- // find inputs that are not on the same backend
1025
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1026
- struct ggml_tensor * src = node->src[j];
1027
- if (src == NULL) {
1028
- break;
1169
+ if (node_allocr != cur_allocr) {
1170
+ sched->splits[cur_split].i_end = i;
1171
+ cur_split++;
1172
+ GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1173
+ sched->splits[cur_split].tallocr = node_allocr;
1174
+ sched->splits[cur_split].i_start = i;
1175
+ sched->splits[cur_split].n_inputs = 0;
1176
+ cur_allocr = node_allocr;
1177
+ cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1029
1178
  }
1030
- ggml_tallocr_t src_allocr = node_allocr(src);
1031
- if (src_allocr != node_allocr) {
1032
- int n_inputs = sched->splits[cur_split].n_inputs++;
1033
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1034
- sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
1035
-
1036
- // create copies
1037
- size_t id = hash_id(src);
1038
- if (sched->node_copies[id][cur_backend_id] == NULL) {
1039
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1040
- sched->node_copies[id][cur_backend_id] = tensor_copy;
1041
- node_allocr(tensor_copy) = cur_allocr;
1042
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1043
- ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1179
+
1180
+ // find inputs that are not on the same backend
1181
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1182
+ struct ggml_tensor * src = node->src[j];
1183
+ if (src == NULL) {
1184
+ break;
1185
+ }
1186
+ ggml_tallocr_t src_allocr = node_allocr(src);
1187
+ GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
1188
+ if (src_allocr != node_allocr) {
1189
+ // check if the input is already in the split
1190
+ bool found = false;
1191
+ for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
1192
+ if (sched->splits[cur_split].inputs[k] == src) {
1193
+ found = true;
1194
+ break;
1195
+ }
1196
+ }
1197
+
1198
+ if (!found) {
1199
+ int n_inputs = sched->splits[cur_split].n_inputs++;
1200
+ //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
1201
+ GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1202
+ sched->splits[cur_split].inputs[n_inputs] = src;
1203
+ }
1204
+
1205
+ // create a copy of the input in the split's backend
1206
+ size_t id = hash_id(src);
1207
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
1208
+ ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1209
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1210
+ ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1211
+
1212
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
1213
+ node_allocr(tensor_copy) = cur_allocr;
1214
+ SET_CAUSE(tensor_copy, "4.cpy");
1215
+ }
1216
+ node->src[j] = sched->node_copies[id][cur_backend_id];
1044
1217
  }
1045
- node->src[j] = sched->node_copies[id][cur_backend_id];
1046
1218
  }
1047
1219
  }
1220
+ sched->splits[cur_split].i_end = graph->n_nodes;
1221
+ sched->n_splits = cur_split + 1;
1048
1222
  }
1049
- sched->splits[cur_split].i_end = graph->n_nodes;
1050
- sched->n_splits = cur_split + 1;
1051
-
1052
- //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
1223
+ #ifdef DEBUG_PASS4
1224
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1225
+ #endif
1053
1226
 
1054
- #if 1
1227
+ #ifndef NDEBUG
1055
1228
  // sanity check: all sources should have the same backend as the node
1056
1229
  for (int i = 0; i < graph->n_nodes; i++) {
1057
1230
  struct ggml_tensor * node = graph->nodes[i];
@@ -1059,6 +1232,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1059
1232
  if (node_allocr == NULL) {
1060
1233
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1061
1234
  }
1235
+ if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
1236
+ fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1237
+ node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1238
+ node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
1239
+ }
1062
1240
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1063
1241
  struct ggml_tensor * src = node->src[j];
1064
1242
  if (src == NULL) {
@@ -1070,8 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1070
1248
  node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1071
1249
  j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
1072
1250
  }
1251
+ if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
1252
+ fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1253
+ src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
1254
+ src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
1255
+ }
1073
1256
  }
1074
1257
  }
1258
+ fflush(stderr);
1075
1259
  #endif
1076
1260
 
1077
1261
  // create copies of the graph for each split
@@ -1085,6 +1269,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1085
1269
  for (int j = 0; j < split->n_inputs; j++) {
1086
1270
  struct ggml_tensor * input = split->inputs[j];
1087
1271
  struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
1272
+ // add a dependency to the input source so that it is not freed before the copy is done
1273
+ GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
1088
1274
  input_cpy->src[0] = input;
1089
1275
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1090
1276
  }
@@ -1119,24 +1305,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1119
1305
  uint64_t copy_start_us = ggml_time_us();
1120
1306
  for (int j = 0; j < split->n_inputs; j++) {
1121
1307
  struct ggml_tensor * input = split->inputs[j];
1122
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
1123
- if (input->buffer == NULL) {
1124
- if (input->view_src == NULL) {
1125
- fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
1126
- exit(1);
1127
- }
1128
- // FIXME: may need to use the sched buffer instead
1129
- ggml_backend_view_init(input->view_src->buffer, input);
1130
- }
1131
- if (input_cpy->buffer == NULL) {
1132
- fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
1133
- exit(1);
1134
- }
1135
- //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
1136
- //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
1137
- ggml_backend_tensor_copy(input, input_cpy);
1308
+ struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
1309
+
1310
+ GGML_ASSERT(input->buffer != NULL);
1311
+ GGML_ASSERT(input_cpy->buffer != NULL);
1312
+
1313
+ // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
1314
+ // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
1315
+ ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
1138
1316
  }
1139
- // ggml_backend_synchronize(split_backend);
1317
+ //ggml_backend_synchronize(split_backend); // necessary to measure copy time
1140
1318
  int64_t copy_end_us = ggml_time_us();
1141
1319
  copy_us[split_backend_id] += copy_end_us - copy_start_us;
1142
1320
 
@@ -1148,7 +1326,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1148
1326
 
1149
1327
  uint64_t compute_start_us = ggml_time_us();
1150
1328
  ggml_backend_graph_compute(split_backend, &split->graph);
1151
- // ggml_backend_synchronize(split_backend);
1329
+ //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1152
1330
  uint64_t compute_end_us = ggml_time_us();
1153
1331
  compute_us[split_backend_id] += compute_end_us - compute_start_us;
1154
1332
  }
@@ -1168,26 +1346,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
1168
1346
  for (int i = 0; i < sched->n_backends; i++) {
1169
1347
  ggml_tallocr_reset(sched->tallocs[i]);
1170
1348
  }
1349
+ // reset state for the next run
1350
+ size_t hash_size = sched->hash_set.size;
1351
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
1352
+ memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
1353
+ memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
1354
+
1355
+ sched->is_reset = true;
1171
1356
  }
1172
1357
 
1173
- ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
1358
+ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
1359
+ GGML_ASSERT(n_backends > 0);
1174
1360
  GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
1175
1361
 
1176
- struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
1177
- memset(sched, 0, sizeof(struct ggml_backend_sched));
1362
+ struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1363
+
1364
+ // initialize hash table
1365
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1366
+ sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
1367
+ sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
1178
1368
 
1179
1369
  sched->n_backends = n_backends;
1180
1370
  for (int i = 0; i < n_backends; i++) {
1181
1371
  sched->backends[i] = backends[i];
1372
+ sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
1182
1373
  }
1183
1374
 
1184
1375
  sched->galloc = ggml_gallocr_new();
1185
1376
 
1186
1377
  // init measure allocs for each backend
1187
1378
  for (int i = 0; i < n_backends; i++) {
1188
- sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
1379
+ sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
1189
1380
  }
1190
1381
 
1382
+ sched_reset(sched);
1383
+
1191
1384
  return sched;
1192
1385
  }
1193
1386
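The constructor now takes an optional list of buffer types plus an upper bound on the graph size, and resets itself before first use. A hedged usage sketch for a single CPU backend; ggml_backend_cpu_init, ggml_backend_free and GGML_DEFAULT_GRAPH_SIZE come from the public ggml headers rather than from this diff:

    #include "ggml.h"
    #include "ggml-backend.h"

    static ggml_backend_sched_t example_sched_new(void) {
        ggml_backend_t backends[1] = { ggml_backend_cpu_init() };
        // bufts == NULL selects ggml_backend_get_default_buffer_type() for each backend
        return ggml_backend_sched_new(backends, NULL, 1, GGML_DEFAULT_GRAPH_SIZE);
    }

The matching teardown is still ggml_backend_sched_free(sched), followed by ggml_backend_free() on each backend.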
 
@@ -1199,6 +1392,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1199
1392
  ggml_tallocr_free(sched->tallocs[i]);
1200
1393
  }
1201
1394
  ggml_gallocr_free(sched->galloc);
1395
+ ggml_free(sched->ctx);
1202
1396
  free(sched->hash_set.keys);
1203
1397
  free(sched->node_talloc);
1204
1398
  free(sched->node_copies);
@@ -1206,12 +1400,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1206
1400
  }
1207
1401
 
1208
1402
  void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1209
- // initialize hash tables
1210
- size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
1211
- sched->hash_set.size = hash_size;
1212
- sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
1213
- sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
1214
- sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
1403
+ GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
1215
1404
 
1216
1405
  sched_split_graph(sched, measure_graph);
1217
1406
  sched_alloc_splits(sched);
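Since the hash tables are now sized and allocated in ggml_backend_sched_new, the measure step only asserts that it runs once, on the measurement allocators. Roughly, the intended call order is the following sketch (measure_graph and graph are illustrative, caller-built cgraphs):

    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n_backends, graph_size);
    ggml_backend_sched_init_measure(sched, measure_graph); // once, with a worst-case graph
    // afterwards: ggml_backend_sched_graph_compute(sched, graph) as often as needed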
@@ -1220,28 +1409,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
1220
1409
  for (int i = 0; i < sched->n_backends; i++) {
1221
1410
  size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
1222
1411
  ggml_tallocr_free(sched->tallocs[i]);
1223
- sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
1412
+ sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
1224
1413
  }
1225
1414
 
1226
1415
  sched_reset(sched);
1227
1416
  }
1228
1417
 
1229
1418
  void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1230
- GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1419
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1420
+
1421
+ if (!sched->is_reset) {
1422
+ sched_reset(sched);
1423
+ }
1231
1424
 
1232
1425
  sched_split_graph(sched, graph);
1233
1426
  sched_alloc_splits(sched);
1234
1427
  sched_compute_splits(sched);
1428
+ }
1429
+
1430
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1235
1431
  sched_reset(sched);
1236
1432
  }
1237
1433
 
1434
+ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1435
+ return sched->n_splits;
1436
+ }
1437
+
1238
1438
  ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
1239
1439
  int backend_index = sched_backend_prio(sched, backend);
1440
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1240
1441
  return sched->tallocs[backend_index];
1241
1442
  }
1242
1443
 
1243
1444
  ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
1244
1445
  int backend_index = sched_backend_prio(sched, backend);
1446
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1245
1447
  return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
1246
1448
  }
1247
1449
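ggml_backend_sched_graph_compute now clears stale state itself, and the new ggml_backend_sched_reset and ggml_backend_sched_get_n_splits entry points let callers reset or inspect the per-run assignments explicitly. An illustrative fragment, assuming <stdio.h>, an existing sched and a built graph:

    ggml_backend_sched_graph_compute(sched, graph);
    fprintf(stderr, "executed in %d splits\n", ggml_backend_sched_get_n_splits(sched));
    ggml_backend_sched_reset(sched); // drop the per-run node assignments before scheduling a different graph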
 
@@ -1251,10 +1453,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
1251
1453
  node_allocr(node) = sched->tallocs[backend_index];
1252
1454
  }
1253
1455
 
1456
+ ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1457
+ ggml_tallocr_t allocr = node_allocr(node);
1458
+ if (allocr == NULL) {
1459
+ return NULL;
1460
+ }
1461
+ return get_allocr_backend(sched, allocr);
1462
+ }
1463
+
1254
1464
  // utils
1465
+
1255
1466
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1256
1467
  GGML_ASSERT(tensor->buffer == NULL);
1257
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
1468
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
1258
1469
  GGML_ASSERT(tensor->view_src != NULL);
1259
1470
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1260
1471
  GGML_ASSERT(tensor->view_src->data != NULL);
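The ggml_backend_sched_get_node_backend getter added in this hunk returns NULL for nodes that have not been assigned an allocator yet, otherwise the backend that will run the node. For example, to dump the assignment of a scheduled graph gf (illustrative fragment, assuming <stdio.h>):

    for (int i = 0; i < gf->n_nodes; i++) {
        ggml_backend_t b = ggml_backend_sched_get_node_backend(sched, gf->nodes[i]);
        fprintf(stderr, "%-32s -> %s\n", gf->nodes[i]->name, b ? ggml_backend_name(b) : "(unassigned)");
    }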
@@ -1320,6 +1531,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
1320
1531
 
1321
1532
  struct ggml_tensor * dst = node_copies[id];
1322
1533
  if (dst->view_src != NULL) {
1534
+ graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
1323
1535
  ggml_backend_view_init(dst->view_src->buffer, dst);
1324
1536
  }
1325
1537
  else {
@@ -1353,6 +1565,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1353
1565
  struct ggml_context * ctx_allocated = ggml_init(params);
1354
1566
  struct ggml_context * ctx_unallocated = ggml_init(params);
1355
1567
 
1568
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1569
+ fprintf(stderr, "failed to allocate context for graph copy\n");
1570
+ free(hash_set.keys);
1571
+ free(node_copies);
1572
+ free(node_init);
1573
+ ggml_free(ctx_allocated);
1574
+ ggml_free(ctx_unallocated);
1575
+ return (struct ggml_backend_graph_copy) {
1576
+ /* .buffer = */ NULL,
1577
+ /* .ctx_allocated = */ NULL,
1578
+ /* .ctx_unallocated = */ NULL,
1579
+ /* .graph = */ NULL,
1580
+ };
1581
+ }
1582
+
1356
1583
  // dup nodes
1357
1584
  for (int i = 0; i < graph->n_nodes; i++) {
1358
1585
  struct ggml_tensor * node = graph->nodes[i];
@@ -1361,6 +1588,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1361
1588
 
1362
1589
  // allocate nodes
1363
1590
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1591
+ if (buffer == NULL) {
1592
+ fprintf(stderr, "failed to allocate buffer for graph copy\n");
1593
+ free(hash_set.keys);
1594
+ free(node_copies);
1595
+ free(node_init);
1596
+ ggml_free(ctx_allocated);
1597
+ ggml_free(ctx_unallocated);
1598
+ return (struct ggml_backend_graph_copy) {
1599
+ /* .buffer = */ NULL,
1600
+ /* .ctx_allocated = */ NULL,
1601
+ /* .ctx_unallocated = */ NULL,
1602
+ /* .graph = */ NULL,
1603
+ };
1604
+ }
1364
1605
 
1365
1606
  //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1366
1607
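Together with the earlier context check, this makes ggml_backend_graph_copy return an all-NULL struct on failure instead of aborting, so callers are expected to test the buffer field. A sketch of the calling pattern (assuming <stdio.h>):

    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend, graph);
    if (copy.buffer == NULL) {
        fprintf(stderr, "graph copy failed (context or buffer allocation)\n");
    } else {
        // copy.graph lives in copy.ctx_allocated and is backed by copy.buffer
        ggml_backend_graph_copy_free(copy);
    }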
 
@@ -1397,8 +1638,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
1397
1638
  ggml_free(copy.ctx_unallocated);
1398
1639
  }
1399
1640
 
1400
- void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1641
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1401
1642
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
1643
+ if (copy.buffer == NULL) {
1644
+ return false;
1645
+ }
1646
+
1402
1647
  struct ggml_cgraph * g1 = graph;
1403
1648
  struct ggml_cgraph * g2 = copy.graph;
1404
1649
 
@@ -1428,4 +1673,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
1428
1673
  }
1429
1674
 
1430
1675
  ggml_backend_graph_copy_free(copy);
1676
+
1677
+ return true;
1431
1678
  }
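With the changed return type, ggml_backend_compare_graph_backend reports false when the graph copy could not be allocated and true otherwise. An illustrative caller follows; the callback shape is assumed from the ggml_backend_eval_callback typedef in ggml-backend.h (node index, the two tensors to compare, user pointer), and a real callback would compare the tensor contents rather than just the shapes:

    static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) node_index; (void) user_data;
        return ggml_are_same_shape(t1, t2); // returning false stops the comparison early
    }

    // ...
    if (!ggml_backend_compare_graph_backend(backend1, backend2, graph, eval_cb, NULL)) {
        fprintf(stderr, "comparison skipped: the graph could not be copied to the second backend\n");
    }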