llama_cpp 0.12.1 → 0.12.2

This diff compares the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries. The hunks below cover the package's vendored copy of the upstream ggml backend source (ggml-backend.c).
@@ -15,7 +15,11 @@
15
15
 
16
16
  // backend buffer type
17
17
 
18
- ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
18
+ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
19
+ return buft->iface.get_name(buft);
20
+ }
21
+
22
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
19
23
  return buft->iface.alloc_buffer(buft, size);
20
24
  }
21
25
 
@@ -23,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
23
27
  return buft->iface.get_alignment(buft);
24
28
  }
25
29
 
26
- size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
30
+ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
27
31
  // get_alloc_size is optional, defaults to ggml_nbytes
28
32
  if (buft->iface.get_alloc_size) {
29
33
  return buft->iface.get_alloc_size(buft, tensor);
@@ -44,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
44
48
 
45
49
  // backend buffer
46
50
 
47
- ggml_backend_buffer_t ggml_backend_buffer_init(
51
+ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
48
52
  ggml_backend_buffer_type_t buft,
49
53
  struct ggml_backend_buffer_i iface,
50
54
  ggml_backend_buffer_context_t context,
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
58
62
  /* .buft = */ buft,
59
63
  /* .context = */ context,
60
64
  /* .size = */ size,
65
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
61
66
  };
62
67
 
63
68
  return buffer;
64
69
  }
65
70
 
71
+ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
72
+ return buffer->iface.get_name(buffer);
73
+ }
74
+
66
75
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
67
76
  if (buffer == NULL) {
68
77
  return;
@@ -86,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
86
95
  return base;
87
96
  }
88
97
 
89
- void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
98
+ GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
90
99
  // init_tensor is optional
91
100
  if (buffer->iface.init_tensor) {
92
101
  buffer->iface.init_tensor(buffer, tensor);
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
94
103
  }
95
104
 
96
105
  size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
97
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
106
+ return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
98
107
  }
99
108
 
100
109
  size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
101
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
110
+ return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
102
111
  }
103
112
 
104
113
  void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
106
115
  }
107
116
 
108
117
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
109
- return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
118
+ return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
110
119
  }
111
120
 
112
- ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
121
+ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
122
+ buffer->usage = usage;
123
+ }
124
+
125
+ ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
113
126
  return buffer->buft;
114
127
  }
115
128
 
129
+ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
130
+ if (buffer->iface.reset) {
131
+ buffer->iface.reset(buffer);
132
+ }
133
+ }
134
+
135
+ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
136
+ ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
137
+ if (dst_buf->iface.cpy_tensor) {
138
+ return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
139
+ }
140
+ return false;
141
+ }
142
+
116
143
  // backend
117
144
 
118
145
  const char * ggml_backend_name(ggml_backend_t backend) {
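For downstream callers, the most visible changes in the buffer hunks above are the rename of ggml_backend_buffer_type() to ggml_backend_buffer_get_type(), the new usage flag, and the buffer/buffer-type name getters. A hypothetical call-site sketch (the 16 MiB size and the printf are illustrative only, not taken from the package; ggml-backend.h is the vendored public header):

    #include <stdio.h>
    #include "ggml-backend.h"

    static void buffer_example(void) {
        // allocate a host buffer and tag it as holding weights (the usage flag is new in this version)
        ggml_backend_buffer_t buf =
            ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 16 * 1024 * 1024);
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        // renamed getter: ggml_backend_buffer_type() -> ggml_backend_buffer_get_type()
        ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
        printf("%s (host: %d)\n", ggml_backend_buft_name(buft), ggml_backend_buffer_is_host(buf));

        ggml_backend_buffer_free(buf);
    }
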
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
146
173
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
147
174
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
148
175
 
149
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
176
+ if (backend->iface.set_tensor_async == NULL) {
177
+ ggml_backend_tensor_set(tensor, data, offset, size);
178
+ } else {
179
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
180
+ }
150
181
  }
151
182
 
152
183
  void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
153
184
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
154
185
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
155
186
 
156
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
187
+ if (backend->iface.get_tensor_async == NULL) {
188
+ ggml_backend_tensor_get(tensor, data, offset, size);
189
+ } else {
190
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
191
+ }
157
192
  }
158
193
 
159
- void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
194
+ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
195
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
196
+
160
197
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
161
- GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
198
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
162
199
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
163
200
 
164
- tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
201
+ tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
165
202
  }
166
203
 
167
- void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
204
+ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
205
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
206
+
168
207
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
169
208
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
170
209
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
171
210
 
172
- tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
211
+ tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
173
212
  }
174
213
 
175
214
  void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
190
229
 
191
230
  void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
192
231
  backend->iface.graph_plan_compute(backend, plan);
193
-
194
- // TODO: optional sync
195
- ggml_backend_synchronize(backend);
196
232
  }
197
233
 
198
234
  bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
199
- if (!backend->iface.graph_compute(backend, cgraph)) {
200
- return false;
201
- }
202
-
203
- // TODO: optional sync
204
- ggml_backend_synchronize(backend);
205
- return true;
235
+ return backend->iface.graph_compute(backend, cgraph);
206
236
  }
207
237
 
208
238
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
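The hunk above drops the unconditional ggml_backend_synchronize() that used to follow graph_plan_compute() and graph_compute(). A minimal sketch of what that implies for callers on asynchronous backends (backend and gf are assumed to already exist; the CPU backend still computes synchronously):

    #include <stdio.h>
    #include "ggml-backend.h"

    static void run_graph(ggml_backend_t backend, struct ggml_cgraph * gf) {
        // compute may now return before the backend has finished its work
        if (!ggml_backend_graph_compute(backend, gf)) {
            fprintf(stderr, "graph compute failed\n");
            return;
        }
        ggml_backend_synchronize(backend);   // wait before reading outputs or timing the run
    }
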
@@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
227
257
  }
228
258
 
229
259
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
230
- //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
231
- //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
232
260
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
233
261
 
234
- // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
235
-
236
262
  if (src == dst) {
237
263
  return;
238
264
  }
239
265
 
240
- // TODO: allow backends to support copy to/from same backend
241
-
242
- if (dst->buffer->iface.cpy_tensor_from != NULL) {
243
- dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
244
- } else if (src->buffer->iface.cpy_tensor_to != NULL) {
245
- src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
246
- } else {
247
- // shouldn't be hit when copying from/to CPU
248
- #ifndef NDEBUG
249
- fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
250
- "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
251
- #endif
266
+ if (ggml_backend_buffer_is_host(src->buffer)) {
267
+ ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
268
+ } else if (ggml_backend_buffer_is_host(dst->buffer)) {
269
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
270
+ } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
271
+ #ifndef NDEBUG
272
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
273
+ #endif
252
274
  size_t nbytes = ggml_nbytes(src);
253
275
  void * data = malloc(nbytes);
254
276
  ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
257
279
  }
258
280
  }
259
281
 
282
+ void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
283
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
284
+
285
+ if (src == dst) {
286
+ return;
287
+ }
288
+
289
+ if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
290
+ if (backend->iface.cpy_tensor_async != NULL) {
291
+ if (backend->iface.cpy_tensor_async(backend, src, dst)) {
292
+ return;
293
+ }
294
+ }
295
+ }
296
+
297
+ size_t nbytes = ggml_nbytes(src);
298
+ if (ggml_backend_buffer_is_host(src->buffer)) {
299
+ ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
300
+ }
301
+ else {
302
+ ggml_backend_tensor_copy(src, dst);
303
+ }
304
+ }
305
+
306
+
260
307
  // backend registry
261
308
 
262
309
  #define GGML_MAX_BACKENDS_REG 16
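Tensor copies between backends are reworked above: the per-buffer cpy_tensor_from/cpy_tensor_to hooks give way to host-aware fast paths plus ggml_backend_buffer_copy_tensor(), and an asynchronous variant is added. A hypothetical usage sketch, assuming t_host and t_dev are tensors with identical layouts living in buffers of two different backends:

    #include "ggml-backend.h"

    static void copy_example(ggml_backend_t dev_backend,
                             struct ggml_tensor * t_host, struct ggml_tensor * t_dev) {
        // blocking copy; falls back to a staging get/set if no direct path exists
        ggml_backend_tensor_copy(t_host, t_dev);

        // asynchronous variant added in this version (may fall back to the blocking copy)
        ggml_backend_tensor_copy_async(dev_backend, t_host, t_dev);
        ggml_backend_synchronize(dev_backend);   // wait before using t_dev
    }
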
@@ -271,9 +318,9 @@ struct ggml_backend_reg {
271
318
  static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
272
319
  static size_t ggml_backend_registry_count = 0;
273
320
 
274
- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
321
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
275
322
 
276
- static void ggml_backend_registry_init(void) {
323
+ GGML_CALL static void ggml_backend_registry_init(void) {
277
324
  static bool initialized = false;
278
325
 
279
326
  if (initialized) {
@@ -286,18 +333,18 @@ static void ggml_backend_registry_init(void) {
286
333
 
287
334
  // add forward decls here to avoid including the backend headers
288
335
  #ifdef GGML_USE_CUBLAS
289
- extern void ggml_backend_cuda_reg_devices(void);
336
+ extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
290
337
  ggml_backend_cuda_reg_devices();
291
338
  #endif
292
339
 
293
340
  #ifdef GGML_USE_METAL
294
- extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
295
- extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
341
+ extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
342
+ extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
296
343
  ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
297
344
  #endif
298
345
  }
299
346
 
300
- void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
347
+ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
301
348
  GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
302
349
 
303
350
  size_t id = ggml_backend_registry_count;
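A recurring change throughout this diff is the GGML_CALL annotation on functions whose addresses are stored in the backend interface structs or passed through the backend registry, which keeps the declaration and the call through a function pointer on a single, consistent calling convention. A minimal sketch of the pattern; the fallback macro definition below is an assumption (the real definition lives in the package's ggml headers and is not part of this diff):

    #include "ggml-backend.h"

    #ifndef GGML_CALL
    #define GGML_CALL   // assumed no-op fallback; may expand to a calling-convention attribute
    #endif

    // functions assigned to iface fields carry the annotation, mirroring
    // ggml_backend_cpu_buffer_name() later in this diff
    GGML_CALL static const char * my_buffer_name(ggml_backend_buffer_t buffer) {
        (void) buffer;
        return "MY_BACKEND";   // hypothetical backend name
    }
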
@@ -392,68 +439,80 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
392
439
 
393
440
  // backend CPU
394
441
 
395
- static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
442
+ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
443
+ return "CPU";
444
+
445
+ GGML_UNUSED(buffer);
446
+ }
447
+
448
+ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
396
449
  return (void *)buffer->context;
397
450
  }
398
451
 
399
- static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
452
+ GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
400
453
  free(buffer->context);
401
454
  }
402
455
 
403
- static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
456
+ GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
404
457
  memcpy((char *)tensor->data + offset, data, size);
405
458
 
406
459
  GGML_UNUSED(buffer);
407
460
  }
408
461
 
409
- static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
462
+ GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
410
463
  memcpy(data, (const char *)tensor->data + offset, size);
411
464
 
412
465
  GGML_UNUSED(buffer);
413
466
  }
414
467
 
415
- static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
416
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
417
-
418
- GGML_UNUSED(buffer);
419
- }
420
-
421
- static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
422
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
468
+ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
469
+ if (ggml_backend_buffer_is_host(src->buffer)) {
470
+ memcpy(dst->data, src->data, ggml_nbytes(src));
471
+ return true;
472
+ }
473
+ return false;
423
474
 
424
475
  GGML_UNUSED(buffer);
425
476
  }
426
477
 
427
- static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
478
+ GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
428
479
  memset(buffer->context, value, buffer->size);
429
480
  }
430
481
 
431
482
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
483
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
432
484
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
433
485
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
434
486
  /* .init_tensor = */ NULL, // no initialization required
435
487
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
436
488
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
437
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
438
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
489
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
439
490
  /* .clear = */ ggml_backend_cpu_buffer_clear,
491
+ /* .reset = */ NULL,
440
492
  };
441
493
 
442
494
  // for buffers from ptr, free is not called
443
495
  static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
496
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
444
497
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
445
498
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
446
499
  /* .init_tensor = */ NULL, // no initialization required
447
500
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
448
501
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
449
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
450
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
502
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
451
503
  /* .clear = */ ggml_backend_cpu_buffer_clear,
504
+ /* .reset = */ NULL,
452
505
  };
453
506
 
454
507
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
455
508
 
456
- static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
509
+ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
510
+ return "CPU";
511
+
512
+ GGML_UNUSED(buft);
513
+ }
514
+
515
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
457
516
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
458
517
  void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
459
518
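The CPU buffer tables above show the new shape of struct ggml_backend_buffer_i: a .get_name slot at the top, a single .cpy_tensor callback in place of cpy_tensor_from/cpy_tensor_to, and an optional .reset slot at the end. A sketch of how a third-party buffer implementation would fill it in; the my_* callbacks are placeholders (declarations omitted), and the struct itself comes from the internal ggml-backend-impl.h header in the vendored sources:

    static struct ggml_backend_buffer_i my_backend_buffer_i = {
        /* .get_name    = */ my_buffer_get_name,   // new in this version
        /* .free_buffer = */ my_buffer_free,
        /* .get_base    = */ my_buffer_get_base,
        /* .init_tensor = */ NULL,                 // optional
        /* .set_tensor  = */ my_buffer_set_tensor,
        /* .get_tensor  = */ my_buffer_get_tensor,
        /* .cpy_tensor  = */ NULL,                 // replaces cpy_tensor_from / cpy_tensor_to
        /* .clear       = */ my_buffer_clear,
        /* .reset       = */ NULL,                 // new in this version, optional
    };
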
 
@@ -462,27 +521,28 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
462
521
  return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
463
522
  }
464
523
 
465
- static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
524
+ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
466
525
  return TENSOR_ALIGNMENT;
467
526
 
468
527
  GGML_UNUSED(buft);
469
528
  }
470
529
 
471
- static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
530
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
472
531
  return ggml_backend_is_cpu(backend);
473
532
 
474
533
  GGML_UNUSED(buft);
475
534
  }
476
535
 
477
- static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
536
+ GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
478
537
  return true;
479
538
 
480
539
  GGML_UNUSED(buft);
481
540
  }
482
541
 
483
- ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
542
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
484
543
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
485
544
  /* .iface = */ {
545
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
486
546
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
487
547
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
488
548
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -501,11 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
501
561
 
502
562
  #include <hbwmalloc.h>
503
563
 
504
- static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
564
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
565
+ return "CPU_HBM";
566
+
567
+ GGML_UNUSED(buft);
568
+ }
569
+
570
+ GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
571
+ return "CPU_HBM";
572
+
573
+ GGML_UNUSED(buf);
574
+ }
575
+
576
+ GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
505
577
  hbw_free(buffer->context);
506
578
  }
507
579
 
508
- static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
580
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
509
581
  //void * ptr = hbw_malloc(size);
510
582
  void * ptr;
511
583
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
514
586
  return NULL;
515
587
  }
516
588
 
517
- // FIXME: this is a hack to avoid having to implement a new buffer type
518
589
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
519
590
  buffer->buft = buft;
591
+ buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
520
592
  buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
521
593
 
522
594
  return buffer;
523
595
  }
524
596
 
525
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
597
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
526
598
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
527
599
  /* .iface = */ {
600
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
528
601
  /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
529
602
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
530
603
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -544,20 +617,20 @@ struct ggml_backend_cpu_context {
544
617
  size_t work_size;
545
618
  };
546
619
 
547
- static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
620
+ GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
548
621
  return "CPU";
549
622
 
550
623
  GGML_UNUSED(backend);
551
624
  }
552
625
 
553
- static void ggml_backend_cpu_free(ggml_backend_t backend) {
626
+ GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
554
627
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
555
628
  free(cpu_ctx->work_data);
556
629
  free(cpu_ctx);
557
630
  free(backend);
558
631
  }
559
632
 
560
- static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
633
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
561
634
  return ggml_backend_cpu_buffer_type();
562
635
 
563
636
  GGML_UNUSED(backend);
@@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu {
568
641
  struct ggml_cgraph cgraph;
569
642
  };
570
643
 
571
- static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
644
+ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
572
645
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
573
646
 
574
647
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -583,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
583
656
  return cpu_plan;
584
657
  }
585
658
 
586
- static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
659
+ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
587
660
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
588
661
 
589
662
  free(cpu_plan->cplan.work_data);
@@ -592,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
592
665
  GGML_UNUSED(backend);
593
666
  }
594
667
 
595
- static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
668
+ GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
596
669
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
597
670
 
598
671
  ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -600,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
600
673
  GGML_UNUSED(backend);
601
674
  }
602
675
 
603
- static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
676
+ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
604
677
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
605
678
 
606
679
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -617,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
617
690
  return true;
618
691
  }
619
692
 
620
- static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
693
+ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
621
694
  switch (op->op) {
622
695
  case GGML_OP_MUL_MAT:
623
696
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
634
707
  /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
635
708
  /* .set_tensor_async = */ NULL,
636
709
  /* .get_tensor_async = */ NULL,
637
- /* .cpy_tensor_from_async = */ NULL,
638
- /* .cpy_tensor_to_async = */ NULL,
710
+ /* .cpy_tensor_async = */ NULL,
639
711
  /* .synchronize = */ NULL,
640
712
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
641
713
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
@@ -660,8 +732,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
660
732
  return cpu_backend;
661
733
  }
662
734
 
663
- bool ggml_backend_is_cpu(ggml_backend_t backend) {
664
- return backend->iface.get_name == ggml_backend_cpu_name;
735
+ GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
736
+ return backend && backend->iface.get_name == ggml_backend_cpu_name;
665
737
  }
666
738
 
667
739
  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
@@ -671,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
671
743
  ctx->n_threads = n_threads;
672
744
  }
673
745
 
674
- ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
746
+ GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
675
747
  return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
676
748
  }
677
749
 
678
- static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
750
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
679
751
  return ggml_backend_cpu_init();
680
752
 
681
753
  GGML_UNUSED(params);
@@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
685
757
 
686
758
  // scheduler
687
759
 
688
- #define GGML_MAX_BACKENDS 4
760
+ #define GGML_MAX_BACKENDS 16
689
761
  #define GGML_MAX_SPLITS 256
690
762
  #define GGML_MAX_SPLIT_INPUTS 16
691
763
 
@@ -695,21 +767,29 @@ struct ggml_backend_sched_split {
695
767
  int i_end;
696
768
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
697
769
  int n_inputs;
770
+ // graph view of this split
698
771
  struct ggml_cgraph graph;
699
772
  };
700
773
 
701
774
  struct ggml_backend_sched {
775
+ bool is_reset; // true if the scheduler has been reset since the last graph split
776
+
702
777
  int n_backends;
703
778
  ggml_backend_t backends[GGML_MAX_BACKENDS];
779
+ ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
704
780
  ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
705
781
 
706
782
  ggml_gallocr_t galloc;
707
783
 
784
+ // hash keys of the nodes in the graph
708
785
  struct ggml_hash_set hash_set;
709
- ggml_tallocr_t * node_talloc; // [hash_set.size]
710
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
786
+ // hash values (arrays of [hash_set.size])
787
+ ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
788
+ struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
711
789
 
790
+ // copy of the graph with modified inputs
712
791
  struct ggml_cgraph * graph;
792
+
713
793
  struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
714
794
  int n_splits;
715
795
 
@@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
750
830
  return INT_MAX;
751
831
  }
752
832
 
753
- static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
833
+ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
754
834
  if (buffer == NULL) {
755
835
  return NULL;
756
836
  }
837
+
838
+ // check if this is already allocate in a allocr buffer (from user manual allocations)
839
+ for (int i = 0; i < sched->n_backends; i++) {
840
+ if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
841
+ return sched->tallocs[i];
842
+ }
843
+ }
844
+
757
845
  // find highest prio backend that supports the buffer type
758
846
  for (int i = 0; i < sched->n_backends; i++) {
759
847
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
760
- return sched->backends[i];
848
+ return sched->tallocs[i];
761
849
  }
762
850
  }
763
851
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
@@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
767
855
  if (allocr == NULL) {
768
856
  return NULL;
769
857
  }
770
- // find highest prio backend that supports the buffer type
771
858
  for (int i = 0; i < sched->n_backends; i++) {
772
859
  if (sched->tallocs[i] == allocr) {
773
860
  return sched->backends[i];
@@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
777
864
  }
778
865
 
779
866
  #if 0
780
- static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
867
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
781
868
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
782
869
  #define GET_CAUSE(node) causes[hash_id(node)]
783
870
  #else
@@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
786
873
  #endif
787
874
 
788
875
  // returns the backend that should be used for the node based on the current locations
789
- static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
790
- // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
791
- // ie. kv cache updates
792
- // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
876
+ static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
877
+ // assign pre-allocated nodes to their backend
793
878
  // dst
794
- ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
795
- if (cur_backend != NULL) {
879
+ ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
880
+ if (cur_allocr != NULL) {
796
881
  SET_CAUSE(node, "1.dst");
797
- return cur_backend;
882
+ return cur_allocr;
798
883
  }
799
-
800
884
  // view_src
801
- if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
802
- SET_CAUSE(node, "1.vsrc");
803
- return get_buffer_backend(sched, node->view_src->buffer);
885
+ if (node->view_src != NULL) {
886
+ cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
887
+ if (cur_allocr != NULL) {
888
+ SET_CAUSE(node, "1.vsrc");
889
+ return cur_allocr;
890
+ }
804
891
  }
805
-
806
- // src
807
- int cur_prio = INT_MAX;
808
- size_t cur_size = 0;
809
-
892
+ // assign nodes that use weights to the backend of the weights
810
893
  for (int i = 0; i < GGML_MAX_SRC; i++) {
811
894
  const struct ggml_tensor * src = node->src[i];
812
895
  if (src == NULL) {
813
896
  break;
814
897
  }
815
- ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
816
- if (src_backend != NULL) {
817
- int src_prio = sched_backend_prio(sched, src_backend);
818
- size_t src_size = ggml_nbytes(src);
819
- if (src_prio < cur_prio && src_size >= cur_size) {
820
- cur_prio = src_prio;
821
- cur_size = src_size;
822
- cur_backend = src_backend;
823
- SET_CAUSE(node, "1.src%d", i);
824
- }
898
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
899
+ ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
900
+ // operations with weights are always run on the same backend as the weights
901
+ SET_CAUSE(node, "1.wgt%d", i);
902
+ return src_allocr;
825
903
  }
826
904
  }
827
- return cur_backend;
905
+
906
+ return NULL;
828
907
  }
829
908
 
830
909
  static char * fmt_size(size_t size) {
@@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
857
936
  }
858
937
  ggml_tallocr_t node_allocr = node_allocr(node);
859
938
  ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
860
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
939
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
861
940
  fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
862
941
  for (int j = 0; j < GGML_MAX_SRC; j++) {
863
942
  struct ggml_tensor * src = node->src[j];
@@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
866
945
  }
867
946
  ggml_tallocr_t src_allocr = node_allocr(src);
868
947
  ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
869
- fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
948
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
870
949
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
871
950
  }
872
951
  fprintf(stderr, "\n");
@@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
882
961
  return dup;
883
962
  }
884
963
 
964
+
965
+ //#define DEBUG_PASS1
966
+ //#define DEBUG_PASS2
967
+ //#define DEBUG_PASS3
968
+ //#define DEBUG_PASS4
969
+
885
970
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
886
- // TODO: merge passes
887
971
  static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
888
- // reset state
889
- size_t hash_size = sched->hash_set.size;
890
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
891
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
892
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
972
+ // reset splits
893
973
  sched->n_splits = 0;
974
+ sched->is_reset = false;
894
975
 
895
976
  struct ggml_init_params params = {
896
977
  /* .mem_size = */ sizeof(sched->context_buffer),
@@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
898
979
  /* .no_alloc = */ true
899
980
  };
900
981
 
901
- if (sched->ctx != NULL) {
902
- ggml_free(sched->ctx);
903
- }
982
+ ggml_free(sched->ctx);
904
983
 
905
984
  sched->ctx = ggml_init(params);
985
+ if (sched->ctx == NULL) {
986
+ fprintf(stderr, "%s: failed to initialize context\n", __func__);
987
+ GGML_ASSERT(false);
988
+ }
906
989
 
907
- // pass 1: assign backends to ops with allocated inputs
990
+ // pass 1: assign backends to ops with pre-allocated inputs
908
991
  for (int i = 0; i < graph->n_leafs; i++) {
909
992
  struct ggml_tensor * leaf = graph->leafs[i];
910
993
  if (node_allocr(leaf) != NULL) {
911
994
  // do not overwrite user assignments
912
995
  continue;
913
996
  }
914
- ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
915
- if (leaf_backend == NULL && leaf->view_src != NULL) {
916
- leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
917
- }
918
- if (leaf_backend != NULL) {
919
- node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
920
- }
997
+ node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
921
998
  }
922
999
 
923
1000
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -926,50 +1003,120 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
926
1003
  // do not overwrite user assignments
927
1004
  continue;
928
1005
  }
929
- ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
930
- if (node_backend != NULL) {
931
- node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
1006
+ node_allocr(node) = sched_allocr_from_cur(sched, node);
1007
+ // src
1008
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1009
+ struct ggml_tensor * src = node->src[j];
1010
+ if (src == NULL) {
1011
+ break;
1012
+ }
1013
+ if (node_allocr(src) == NULL) {
1014
+ node_allocr(src) = sched_allocr_from_cur(sched, src);
1015
+ }
932
1016
  }
933
1017
  }
934
- //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1018
+ #ifdef DEBUG_PASS1
1019
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1020
+ #endif
935
1021
 
936
- // pass 2: assign backends to ops from current assignments
937
- // TODO:
938
- // - reuse sched_backend_from_cur
939
- for (int i = 0; i < graph->n_nodes; i++) {
940
- struct ggml_tensor * node = graph->nodes[i];
941
- ggml_tallocr_t node_allocr = node_allocr(node);
942
- if (node_allocr == NULL) {
943
- int cur_prio = INT_MAX;
944
- size_t cur_size = 0;
945
- for (int j = 0; j < GGML_MAX_SRC; j++) {
946
- struct ggml_tensor * src = node->src[j];
947
- if (src == NULL) {
948
- break;
1022
+ // pass 2: expand current backend assignments
1023
+ // assign the same backend to adjacent nodes
1024
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1025
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1026
+
1027
+ // pass 2.1 expand gpu up
1028
+ {
1029
+ ggml_tallocr_t cur_allocr = NULL;
1030
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1031
+ struct ggml_tensor * node = graph->nodes[i];
1032
+ if (ggml_is_view_op(node->op)) {
1033
+ continue;
1034
+ }
1035
+ ggml_tallocr_t node_allocr = node_allocr(node);
1036
+ if (node_allocr != NULL) {
1037
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1038
+ // skip cpu (lowest prio backend)
1039
+ cur_allocr = NULL;
1040
+ } else {
1041
+ cur_allocr = node_allocr;
949
1042
  }
950
- ggml_tallocr_t src_allocr = node_allocr(src);
951
- if (src_allocr != NULL) {
952
- int src_prio = sched_allocr_prio(sched, src_allocr);
953
- size_t src_size = ggml_nbytes(src);
954
- if (src_prio < cur_prio && src_size >= cur_size) {
955
- cur_prio = src_prio;
956
- cur_size = src_size;
957
- node_allocr = src_allocr;
958
- SET_CAUSE(node, "2.src%d", j);
959
- }
1043
+ } else {
1044
+ node_allocr(node) = cur_allocr;
1045
+ SET_CAUSE(node, "2.1");
1046
+ }
1047
+ }
1048
+ }
1049
+
1050
+ // pass 2.2 expand gpu down
1051
+ {
1052
+ ggml_tallocr_t cur_allocr = NULL;
1053
+ for (int i = 0; i < graph->n_nodes; i++) {
1054
+ struct ggml_tensor * node = graph->nodes[i];
1055
+ if (ggml_is_view_op(node->op)) {
1056
+ continue;
1057
+ }
1058
+ ggml_tallocr_t node_allocr = node_allocr(node);
1059
+ if (node_allocr != NULL) {
1060
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1061
+ // skip cpu (lowest prio backend)
1062
+ cur_allocr = NULL;
1063
+ } else {
1064
+ cur_allocr = node_allocr;
960
1065
  }
1066
+ } else {
1067
+ node_allocr(node) = cur_allocr;
1068
+ SET_CAUSE(node, "2.2");
961
1069
  }
1070
+ }
1071
+ }
1072
+
1073
+ // pass 2.3 expand rest up
1074
+ {
1075
+ ggml_tallocr_t cur_allocr = NULL;
1076
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1077
+ struct ggml_tensor * node = graph->nodes[i];
1078
+ if (ggml_is_view_op(node->op)) {
1079
+ continue;
1080
+ }
1081
+ ggml_tallocr_t node_allocr = node_allocr(node);
962
1082
  if (node_allocr != NULL) {
963
- node_allocr(node) = node_allocr;
1083
+ cur_allocr = node_allocr;
1084
+ } else {
1085
+ node_allocr(node) = cur_allocr;
1086
+ SET_CAUSE(node, "2.3");
964
1087
  }
965
1088
  }
966
1089
  }
967
- //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
968
1090
 
969
- // pass 3: assign backends to remaining src from dst (should only be leafs)
1091
+ // pass 2.4 expand rest down
1092
+ {
1093
+ ggml_tallocr_t cur_allocr = NULL;
1094
+ for (int i = 0; i < graph->n_nodes; i++) {
1095
+ struct ggml_tensor * node = graph->nodes[i];
1096
+ if (ggml_is_view_op(node->op)) {
1097
+ continue;
1098
+ }
1099
+ ggml_tallocr_t node_allocr = node_allocr(node);
1100
+ if (node_allocr != NULL) {
1101
+ cur_allocr = node_allocr;
1102
+ } else {
1103
+ node_allocr(node) = cur_allocr;
1104
+ SET_CAUSE(node, "2.4");
1105
+ }
1106
+ }
1107
+ }
1108
+ #ifdef DEBUG_PASS2
1109
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1110
+ #endif
1111
+
1112
+ // pass 3: assign backends to remaining src from dst and view_src
970
1113
  for (int i = 0; i < graph->n_nodes; i++) {
971
1114
  struct ggml_tensor * node = graph->nodes[i];
972
- ggml_tallocr_t node_allocr = node_allocr(node);
1115
+ ggml_tallocr_t cur_allocr = node_allocr(node);
1116
+ if (node->view_src != NULL && cur_allocr == NULL) {
1117
+ cur_allocr = node_allocr(node) = node_allocr(node->view_src);
1118
+ SET_CAUSE(node, "3.vsrc");
1119
+ }
973
1120
  for (int j = 0; j < GGML_MAX_SRC; j++) {
974
1121
  struct ggml_tensor * src = node->src[j];
975
1122
  if (src == NULL) {
@@ -977,81 +1124,107 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
977
1124
  }
978
1125
  ggml_tallocr_t src_allocr = node_allocr(src);
979
1126
  if (src_allocr == NULL) {
980
- node_allocr(src) = node_allocr;
1127
+ if (src->view_src != NULL) {
1128
+ // views are always on the same backend as the source
1129
+ node_allocr(src) = node_allocr(src->view_src);
1130
+ SET_CAUSE(src, "3.vsrc");
1131
+ } else {
1132
+ node_allocr(src) = cur_allocr;
1133
+ SET_CAUSE(src, "3.cur");
1134
+ }
981
1135
  }
982
1136
  }
983
1137
  }
984
- //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1138
+ #ifdef DEBUG_PASS3
1139
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1140
+ #endif
985
1141
 
986
1142
  // pass 4: split graph, find tensors that need to be copied
987
- // TODO:
988
- // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
989
- // find first backend
990
- int cur_split = 0;
991
- for (int i = 0; i < graph->n_nodes; i++) {
992
- struct ggml_tensor * node = graph->nodes[i];
993
- if (node->view_src == NULL) {
994
- sched->splits[0].tallocr = node_allocr(node);
995
- break;
996
- }
997
- }
998
- sched->splits[0].i_start = 0;
999
- sched->splits[0].n_inputs = 0;
1000
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1001
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1002
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1003
- for (int i = 0; i < graph->n_nodes; i++) {
1004
- struct ggml_tensor * node = graph->nodes[i];
1005
-
1006
- if (ggml_is_view_op(node->op)) {
1007
- continue;
1143
+ {
1144
+ int cur_split = 0;
1145
+ // find the backend of the first split, skipping view ops
1146
+ for (int i = 0; i < graph->n_nodes; i++) {
1147
+ struct ggml_tensor * node = graph->nodes[i];
1148
+ if (!ggml_is_view_op(node->op)) {
1149
+ sched->splits[0].tallocr = node_allocr(node);
1150
+ break;
1151
+ }
1008
1152
  }
1153
+ sched->splits[0].i_start = 0;
1154
+ sched->splits[0].n_inputs = 0;
1155
+ memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1156
+ ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1157
+ size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1158
+ for (int i = 0; i < graph->n_nodes; i++) {
1159
+ struct ggml_tensor * node = graph->nodes[i];
1160
+
1161
+ if (ggml_is_view_op(node->op)) {
1162
+ continue;
1163
+ }
1009
1164
 
1010
- ggml_tallocr_t node_allocr = node_allocr(node);
1165
+ ggml_tallocr_t node_allocr = node_allocr(node);
1011
1166
 
1012
- if (node_allocr != cur_allocr) {
1013
- sched->splits[cur_split].i_end = i;
1014
- cur_split++;
1015
- GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1016
- sched->splits[cur_split].tallocr = node_allocr;
1017
- sched->splits[cur_split].i_start = i;
1018
- sched->splits[cur_split].n_inputs = 0;
1019
- memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
1020
- cur_allocr = node_allocr;
1021
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1022
- }
1167
+ GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
1023
1168
 
1024
- // find inputs that are not on the same backend
1025
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1026
- struct ggml_tensor * src = node->src[j];
1027
- if (src == NULL) {
1028
- break;
1169
+ if (node_allocr != cur_allocr) {
1170
+ sched->splits[cur_split].i_end = i;
1171
+ cur_split++;
1172
+ GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1173
+ sched->splits[cur_split].tallocr = node_allocr;
1174
+ sched->splits[cur_split].i_start = i;
1175
+ sched->splits[cur_split].n_inputs = 0;
1176
+ cur_allocr = node_allocr;
1177
+ cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1029
1178
  }
1030
- ggml_tallocr_t src_allocr = node_allocr(src);
1031
- if (src_allocr != node_allocr) {
1032
- int n_inputs = sched->splits[cur_split].n_inputs++;
1033
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1034
- sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
1035
-
1036
- // create copies
1037
- size_t id = hash_id(src);
1038
- if (sched->node_copies[id][cur_backend_id] == NULL) {
1039
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1040
- sched->node_copies[id][cur_backend_id] = tensor_copy;
1041
- node_allocr(tensor_copy) = cur_allocr;
1042
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1043
- ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1179
+
1180
+ // find inputs that are not on the same backend
1181
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1182
+ struct ggml_tensor * src = node->src[j];
1183
+ if (src == NULL) {
1184
+ break;
1185
+ }
1186
+ ggml_tallocr_t src_allocr = node_allocr(src);
1187
+ GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
1188
+ if (src_allocr != node_allocr) {
1189
+ // check if the input is already in the split
1190
+ bool found = false;
1191
+ for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
1192
+ if (sched->splits[cur_split].inputs[k] == src) {
1193
+ found = true;
1194
+ break;
1195
+ }
1196
+ }
1197
+
1198
+ if (!found) {
1199
+ int n_inputs = sched->splits[cur_split].n_inputs++;
1200
+ //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
1201
+ GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1202
+ sched->splits[cur_split].inputs[n_inputs] = src;
1203
+ }
1204
+
1205
+ // create a copy of the input in the split's backend
1206
+ size_t id = hash_id(src);
1207
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
1208
+ ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1209
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1210
+ ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1211
+
1212
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
1213
+ node_allocr(tensor_copy) = cur_allocr;
1214
+ SET_CAUSE(tensor_copy, "4.cpy");
1215
+ }
1216
+ node->src[j] = sched->node_copies[id][cur_backend_id];
1044
1217
  }
1045
- node->src[j] = sched->node_copies[id][cur_backend_id];
1046
1218
  }
1047
1219
  }
1220
+ sched->splits[cur_split].i_end = graph->n_nodes;
1221
+ sched->n_splits = cur_split + 1;
1048
1222
  }
1049
- sched->splits[cur_split].i_end = graph->n_nodes;
1050
- sched->n_splits = cur_split + 1;
1051
-
1052
- //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
1223
+ #ifdef DEBUG_PASS4
1224
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1225
+ #endif
1053
1226
 
1054
- #if 1
1227
+ #ifndef NDEBUG
1055
1228
  // sanity check: all sources should have the same backend as the node
1056
1229
  for (int i = 0; i < graph->n_nodes; i++) {
1057
1230
  struct ggml_tensor * node = graph->nodes[i];
@@ -1059,6 +1232,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1059
1232
  if (node_allocr == NULL) {
1060
1233
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1061
1234
  }
1235
+ if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
1236
+ fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1237
+ node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1238
+ node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
1239
+ }
1062
1240
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1063
1241
  struct ggml_tensor * src = node->src[j];
1064
1242
  if (src == NULL) {
@@ -1070,8 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1070
1248
  node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1071
1249
  j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
1072
1250
  }
1251
+ if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
1252
+ fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1253
+ src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
1254
+ src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
1255
+ }
1073
1256
  }
1074
1257
  }
1258
+ fflush(stderr);
1075
1259
  #endif
1076
1260
 
1077
1261
  // create copies of the graph for each split
@@ -1085,6 +1269,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1085
1269
  for (int j = 0; j < split->n_inputs; j++) {
1086
1270
  struct ggml_tensor * input = split->inputs[j];
1087
1271
  struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
1272
+ // add a dependency to the input source so that it is not freed before the copy is done
1273
+ GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
1088
1274
  input_cpy->src[0] = input;
1089
1275
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1090
1276
  }
@@ -1119,24 +1305,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1119
1305
  uint64_t copy_start_us = ggml_time_us();
1120
1306
  for (int j = 0; j < split->n_inputs; j++) {
1121
1307
  struct ggml_tensor * input = split->inputs[j];
1122
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
1123
- if (input->buffer == NULL) {
1124
- if (input->view_src == NULL) {
1125
- fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
1126
- exit(1);
1127
- }
1128
- // FIXME: may need to use the sched buffer instead
1129
- ggml_backend_view_init(input->view_src->buffer, input);
1130
- }
1131
- if (input_cpy->buffer == NULL) {
1132
- fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
1133
- exit(1);
1134
- }
1135
- //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
1136
- //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
1137
- ggml_backend_tensor_copy(input, input_cpy);
1308
+ struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
1309
+
1310
+ GGML_ASSERT(input->buffer != NULL);
1311
+ GGML_ASSERT(input_cpy->buffer != NULL);
1312
+
1313
+ // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
1314
+ // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
1315
+ ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
1138
1316
  }
1139
- // ggml_backend_synchronize(split_backend);
1317
+ //ggml_backend_synchronize(split_backend); // necessary to measure copy time
1140
1318
  int64_t copy_end_us = ggml_time_us();
1141
1319
  copy_us[split_backend_id] += copy_end_us - copy_start_us;
1142
1320
 
@@ -1148,7 +1326,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1148
1326
 
1149
1327
  uint64_t compute_start_us = ggml_time_us();
1150
1328
  ggml_backend_graph_compute(split_backend, &split->graph);
1151
- // ggml_backend_synchronize(split_backend);
1329
+ //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1152
1330
  uint64_t compute_end_us = ggml_time_us();
1153
1331
  compute_us[split_backend_id] += compute_end_us - compute_start_us;
1154
1332
  }
@@ -1168,26 +1346,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
  for (int i = 0; i < sched->n_backends; i++) {
  ggml_tallocr_reset(sched->tallocs[i]);
  }
+ // reset state for the next run
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+ memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
+ memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+
+ sched->is_reset = true;
  }
 
- ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
+ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+ GGML_ASSERT(n_backends > 0);
  GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
 
- struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
- memset(sched, 0, sizeof(struct ggml_backend_sched));
+ struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+
+ // initialize hash table
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+ sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
+ sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
 
  sched->n_backends = n_backends;
  for (int i = 0; i < n_backends; i++) {
  sched->backends[i] = backends[i];
+ sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
  }
 
  sched->galloc = ggml_gallocr_new();
 
  // init measure allocs for each backend
  for (int i = 0; i < n_backends; i++) {
- sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
+ sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
  }
 
+ sched_reset(sched);
+
  return sched;
  }
 
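Note: ggml_backend_sched_new now takes an optional bufts array and a graph_size used to size the internal hash tables; passing NULL for bufts selects each backend's default buffer type, as the loop above shows. A sketch of an updated call site, using a single CPU backend purely for illustration (GGML_DEFAULT_GRAPH_SIZE stands in for whatever node budget the caller actually needs):

    #include "ggml-backend.h"

    static ggml_backend_sched_t make_cpu_sched(void) {
        ggml_backend_t backends[1] = { ggml_backend_cpu_init() };
        // NULL bufts -> ggml_backend_get_default_buffer_type() is used per backend
        return ggml_backend_sched_new(backends, /*bufts=*/NULL, /*n_backends=*/1, GGML_DEFAULT_GRAPH_SIZE);
    }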
@@ -1199,6 +1392,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
  ggml_tallocr_free(sched->tallocs[i]);
  }
  ggml_gallocr_free(sched->galloc);
+ ggml_free(sched->ctx);
  free(sched->hash_set.keys);
  free(sched->node_talloc);
  free(sched->node_copies);
@@ -1206,12 +1400,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
  }
 
  void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
- // initialize hash tables
- size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
- sched->hash_set.size = hash_size;
- sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
- sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
- sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+ GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
 
  sched_split_graph(sched, measure_graph);
  sched_alloc_splits(sched);
@@ -1220,28 +1409,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
  for (int i = 0; i < sched->n_backends; i++) {
  size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
  ggml_tallocr_free(sched->tallocs[i]);
- sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
+ sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
  }
 
  sched_reset(sched);
  }
 
  void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+ if (!sched->is_reset) {
+ sched_reset(sched);
+ }
 
  sched_split_graph(sched, graph);
  sched_alloc_splits(sched);
  sched_compute_splits(sched);
+ }
+
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
  sched_reset(sched);
  }
 
+ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+ return sched->n_splits;
+ }
+
  ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
  int backend_index = sched_backend_prio(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  return sched->tallocs[backend_index];
  }
 
  ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
  int backend_index = sched_backend_prio(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
  }
 
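Note: the compute path now resets the scheduler state lazily via is_reset, and two small accessors are exposed: ggml_backend_sched_reset and ggml_backend_sched_get_n_splits. A sketch of the intended flow, assuming sched, measure_graph, and graph are built elsewhere:

    // one-time measurement pass sizes the per-backend allocators
    ggml_backend_sched_init_measure(sched, measure_graph);

    // split, allocate and execute a graph
    ggml_backend_sched_graph_compute(sched, graph);
    printf("executed in %d splits\n", ggml_backend_sched_get_n_splits(sched));

    // clear node/backend assignments before building the next graph
    ggml_backend_sched_reset(sched);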
@@ -1251,10 +1453,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
  node_allocr(node) = sched->tallocs[backend_index];
  }
 
+ ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ ggml_tallocr_t allocr = node_allocr(node);
+ if (allocr == NULL) {
+ return NULL;
+ }
+ return get_allocr_backend(sched, allocr);
+ }
+
  // utils
+
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);
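Note: ggml_backend_sched_get_node_backend is new and returns NULL for nodes the scheduler has not assigned to a backend. A sketch of inspecting the assignments after a compute pass (sched and graph as in the sketch above):

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        ggml_backend_t b = ggml_backend_sched_get_node_backend(sched, node);
        printf("%s -> %s\n", node->name, b ? ggml_backend_name(b) : "(unassigned)");
    }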
@@ -1320,6 +1531,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
 
  struct ggml_tensor * dst = node_copies[id];
  if (dst->view_src != NULL) {
+ graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
  ggml_backend_view_init(dst->view_src->buffer, dst);
  }
  else {
@@ -1353,6 +1565,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  struct ggml_context * ctx_allocated = ggml_init(params);
  struct ggml_context * ctx_unallocated = ggml_init(params);
 
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+ fprintf(stderr, "failed to allocate context for graph copy\n");
+ free(hash_set.keys);
+ free(node_copies);
+ free(node_init);
+ ggml_free(ctx_allocated);
+ ggml_free(ctx_unallocated);
+ return (struct ggml_backend_graph_copy) {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }
+
  // dup nodes
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
@@ -1361,6 +1588,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
 
  // allocate nodes
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+ if (buffer == NULL) {
+ fprintf(stderr, "failed to allocate buffer for graph copy\n");
+ free(hash_set.keys);
+ free(node_copies);
+ free(node_init);
+ ggml_free(ctx_allocated);
+ ggml_free(ctx_unallocated);
+ return (struct ggml_backend_graph_copy) {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }
 
  //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
 
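Note: ggml_backend_graph_copy can now fail and returns a zeroed struct instead of aborting, so callers should check the buffer field before using the copy. A sketch (backend and graph assumed valid, <stdio.h> assumed included):

    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend, graph);
    if (copy.buffer == NULL) {
        fprintf(stderr, "graph copy failed\n");
    } else {
        // ... run and inspect copy.graph ...
        ggml_backend_graph_copy_free(copy);
    }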
@@ -1397,8 +1638,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
  ggml_free(copy.ctx_unallocated);
  }
 
- void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+ if (copy.buffer == NULL) {
+ return false;
+ }
+
  struct ggml_cgraph * g1 = graph;
  struct ggml_cgraph * g2 = copy.graph;
 
@@ -1428,4 +1673,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
  }
 
  ggml_backend_graph_copy_free(copy);
+
+ return true;
  }
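Note: ggml_backend_compare_graph_backend now reports failure through its bool return value. A sketch of a call site, assuming the ggml_backend_eval_callback signature of (int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data):

    static bool check_node(int i, struct ggml_tensor * t1, struct ggml_tensor * t2, void * ud) {
        (void) i; (void) t1; (void) t2; (void) ud;
        return true; // keep comparing the remaining nodes
    }

    // ...
    if (!ggml_backend_compare_graph_backend(backend1, backend2, graph, check_node, NULL)) {
        fprintf(stderr, "comparison skipped: graph copy could not be allocated\n");
    }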