cui-llama.rn 1.2.3 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@
8
8
  #include <windows.h>
9
9
  #endif
10
10
 
11
+ #include "ggml-backend.h"
11
12
  #include "ggml-backend-impl.h"
12
13
  #include "ggml-alloc.h"
13
14
  #include "ggml-impl.h"
@@ -34,6 +35,11 @@ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
34
35
  }
35
36
 
36
37
  lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
38
+ if (size == 0) {
39
+ // return a dummy buffer for zero-sized allocations
40
+ return lm_ggml_backend_buffer_init(buft, {}, NULL, 0);
41
+ }
42
+
37
43
  return buft->iface.alloc_buffer(buft, size);
38
44
  }
39
45
 
@@ -89,7 +95,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
89
95
  }
90
96
 
91
97
  const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
92
- return buffer->iface.get_name(buffer);
98
+ return lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(buffer));
93
99
  }
94
100
 
95
101
  void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
@@ -108,6 +114,11 @@ size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
108
114
  }
109
115
 
110
116
  void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
117
+ // get_base is optional if the buffer is zero-sized
118
+ if (buffer->size == 0) {
119
+ return NULL;
120
+ }
121
+
111
122
  void * base = buffer->iface.get_base(buffer);
112
123
 
113
124
  LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +133,15 @@ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct
122
133
  }
123
134
  }
124
135
 
136
+ void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
137
+ // clear is optional if the buffer is zero-sized
138
+ if (buffer->size == 0) {
139
+ return;
140
+ }
141
+
142
+ buffer->iface.clear(buffer, value);
143
+ }
144
+
125
145
  size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
126
146
  return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
127
147
  }
@@ -134,10 +154,6 @@ size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, st
134
154
  return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
135
155
  }
136
156
 
137
- void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
138
- buffer->iface.clear(buffer, value);
139
- }
140
-
141
157
  bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
142
158
  return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
143
159
  }
@@ -198,7 +214,7 @@ void lm_ggml_backend_free(lm_ggml_backend_t backend) {
198
214
  }
199
215
 
200
216
  lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
201
- return backend->iface.get_default_buffer_type(backend);
217
+ return lm_ggml_backend_dev_buffer_type(backend->device);
202
218
  }
203
219
 
204
220
  lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
@@ -238,43 +254,42 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
238
254
  void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
239
255
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
240
256
 
257
+ if (size == 0) {
258
+ return;
259
+ }
260
+
241
261
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
242
262
  LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
243
263
  LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
244
264
 
245
- if (!size) {
246
- return;
247
- }
248
-
249
265
  buf->iface.set_tensor(buf, tensor, data, offset, size);
250
266
  }
251
267
 
252
268
  void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
253
269
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
254
270
 
271
+ if (size == 0) {
272
+ return;
273
+ }
274
+
255
275
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
256
276
  LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257
277
  LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");
258
278
 
259
- if (!size) {
260
- return;
261
- }
262
-
263
279
  buf->iface.get_tensor(buf, tensor, data, offset, size);
264
280
  }
265
281
 
266
282
  LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
267
283
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
268
284
 
269
- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
270
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
271
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
272
-
273
- if (!size) {
285
+ if (size == 0) {
274
286
  return;
275
287
  }
276
288
 
277
- LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
289
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
290
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
291
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
292
+ LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
278
293
 
279
294
  buf->iface.memset_tensor(buf, tensor, value, offset, size);
280
295
  }
@@ -316,33 +331,15 @@ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backen
316
331
  }
317
332
 
318
333
  bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
319
- // helper to ease transition to device interface
320
- if (backend->device) {
321
- return lm_ggml_backend_dev_supports_op(backend->device, op);
322
- }
323
-
324
- return backend->iface.supports_op(backend, op);
334
+ return lm_ggml_backend_dev_supports_op(backend->device, op);
325
335
  }
326
336
 
327
337
  bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
328
- // helper to ease transition to device interface
329
- if (backend->device) {
330
- return lm_ggml_backend_dev_supports_buft(backend->device, buft);
331
- }
332
-
333
- return backend->iface.supports_buft(backend, buft);
338
+ return lm_ggml_backend_dev_supports_buft(backend->device, buft);
334
339
  }
335
340
 
336
341
  bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
337
- // helper to ease transition to device interface
338
- if (backend->device) {
339
- return lm_ggml_backend_dev_offload_op(backend->device, op);
340
- }
341
-
342
- if (backend->iface.offload_op != NULL) {
343
- return backend->iface.offload_op(backend, op);
344
- }
345
- return false;
342
+ return lm_ggml_backend_dev_offload_op(backend->device, op);
346
343
  }
347
344
 
348
345
  lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
@@ -379,7 +376,7 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten
379
376
  lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
380
377
  } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
381
378
  #ifndef NDEBUG
382
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
379
+ LM_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
383
380
  #endif
384
381
  size_t nbytes = lm_ggml_nbytes(src);
385
382
  void * data = malloc(nbytes);
@@ -538,10 +535,40 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
538
535
  #include "ggml-metal.h"
539
536
  #endif
540
537
 
538
+ #ifdef LM_GGML_USE_SYCL
539
+ #include "ggml-sycl.h"
540
+ #endif
541
+
542
+ #ifdef LM_GGML_USE_VULKAN
543
+ #include "ggml-vulkan.h"
544
+ #endif
545
+
541
546
  #ifdef LM_GGML_USE_BLAS
542
547
  #include "ggml-blas.h"
543
548
  #endif
544
549
 
550
+ #ifdef LM_GGML_USE_RPC
551
+ #include "ggml-rpc.h"
552
+ #endif
553
+
554
+ #ifndef __AMX_INT8__
555
+ #undef LM_GGML_USE_AMX
556
+ #endif
557
+
558
+ #ifdef LM_GGML_USE_AMX
559
+ # include "ggml-amx.h"
560
+ #endif
561
+
562
+ #ifdef LM_GGML_USE_CANN
563
+ #include "ggml-cann.h"
564
+ #endif
565
+
566
+ #ifdef LM_GGML_USE_KOMPUTE
567
+ #include "ggml-kompute.h"
568
+ #endif
569
+
570
+ #include "ggml-cpu.h"
571
+
545
572
  struct lm_ggml_backend_registry {
546
573
  std::vector<lm_ggml_backend_reg_t> backends;
547
574
  std::vector<lm_ggml_backend_dev_t> devices;
@@ -553,18 +580,34 @@ struct lm_ggml_backend_registry {
553
580
  #ifdef LM_GGML_USE_METAL
554
581
  register_backend(lm_ggml_backend_metal_reg());
555
582
  #endif
583
+ #ifdef LM_GGML_USE_SYCL
584
+ register_backend(lm_ggml_backend_sycl_reg());
585
+ #endif
586
+ #ifdef LM_GGML_USE_VULKAN
587
+ register_backend(lm_ggml_backend_vk_reg());
588
+ #endif
589
+ #ifdef LM_GGML_USE_CANN
590
+ register_backend(lm_ggml_backend_cann_reg());
591
+ #endif
556
592
  #ifdef LM_GGML_USE_BLAS
557
593
  register_backend(lm_ggml_backend_blas_reg());
558
594
  #endif
559
-
560
- // TODO: sycl, vulkan, kompute, cann
595
+ #ifdef LM_GGML_USE_RPC
596
+ register_backend(lm_ggml_backend_rpc_reg());
597
+ #endif
598
+ #ifdef LM_GGML_USE_AMX
599
+ register_backend(lm_ggml_backend_amx_reg());
600
+ #endif
601
+ #ifdef LM_GGML_USE_KOMPUTE
602
+ register_backend(lm_ggml_backend_kompute_reg());
603
+ #endif
561
604
 
562
605
  register_backend(lm_ggml_backend_cpu_reg());
563
606
  }
564
607
 
565
608
  void register_backend(lm_ggml_backend_reg_t reg) {
566
609
  #ifndef NDEBUG
567
- fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
610
+ LM_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
568
611
  __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
569
612
  #endif
570
613
  backends.push_back(reg);
@@ -575,7 +618,7 @@ struct lm_ggml_backend_registry {
575
618
 
576
619
  void register_device(lm_ggml_backend_dev_t device) {
577
620
  #ifndef NDEBUG
578
- fprintf(stderr, "%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
621
+ LM_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
579
622
  #endif
580
623
  devices.push_back(device);
581
624
  }
@@ -663,9 +706,9 @@ lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type typ
663
706
  }
664
707
 
665
708
  lm_ggml_backend_t lm_ggml_backend_init_best(void) {
666
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
709
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU);
667
710
  if (!dev) {
668
- dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
711
+ dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
669
712
  }
670
713
  if (!dev) {
671
714
  return NULL;
@@ -673,1918 +716,1946 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
673
716
  return lm_ggml_backend_dev_init(dev, NULL);
674
717
  }
675
718
 
676
- // backend CPU
719
+ // multi-buffer buffer
677
720
 
678
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
721
+ struct lm_ggml_backend_multi_buffer_context {
722
+ lm_ggml_backend_buffer_t * buffers;
723
+ size_t n_buffers;
724
+ };
679
725
 
680
- static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
681
- return "CPU";
726
+ static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
727
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
728
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
729
+ lm_ggml_backend_buffer_free(ctx->buffers[i]);
730
+ }
682
731
 
683
- LM_GGML_UNUSED(buffer);
732
+ free(ctx->buffers);
733
+ free(ctx);
684
734
  }
685
735
 
686
- static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
687
- uintptr_t data = (uintptr_t)buffer->context;
688
-
689
- // align the buffer
690
- if (data % TENSOR_ALIGNMENT != 0) {
691
- data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
736
+ static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
737
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
738
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
739
+ lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
692
740
  }
693
-
694
- return (void *)data;
695
741
  }
696
742
 
697
- static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
698
- free(buffer->context);
699
- }
743
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
744
+ /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
745
+ /* .get_base = */ NULL,
746
+ /* .init_tensor = */ NULL,
747
+ /* .memset_tensor = */ NULL,
748
+ /* .set_tensor = */ NULL,
749
+ /* .get_tensor = */ NULL,
750
+ /* .cpy_tensor = */ NULL,
751
+ /* .clear = */ lm_ggml_backend_multi_buffer_clear,
752
+ /* .reset = */ NULL,
753
+ };
700
754
 
701
- static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
702
- memset((char *)tensor->data + offset, value, size);
755
+ lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
756
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
757
+ ctx->n_buffers = n_buffers;
758
+ ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
703
759
 
704
- LM_GGML_UNUSED(buffer);
705
- }
760
+ LM_GGML_ASSERT(ctx->buffers != NULL);
706
761
 
707
- static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
708
- memcpy((char *)tensor->data + offset, data, size);
762
+ size_t total_size = 0;
763
+ for (size_t i = 0; i < n_buffers; i++) {
764
+ ctx->buffers[i] = buffers[i];
765
+ total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
766
+ }
709
767
 
710
- LM_GGML_UNUSED(buffer);
768
+ return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
711
769
  }
712
770
 
713
- static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
714
- memcpy(data, (const char *)tensor->data + offset, size);
715
-
716
- LM_GGML_UNUSED(buffer);
771
+ bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
772
+ return buffer->iface.free_buffer == lm_ggml_backend_multi_buffer_free_buffer;
717
773
  }
718
774
 
719
- static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
720
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
721
- memcpy(dst->data, src->data, lm_ggml_nbytes(src));
722
- return true;
775
+ void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
776
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
777
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
778
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
779
+ lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
723
780
  }
724
- return false;
781
+ }
725
782
 
726
- LM_GGML_UNUSED(buffer);
783
+ // creates a copy of the tensor with the same memory layout
784
+ static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
785
+ struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
786
+ for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
787
+ dup->nb[i] = tensor->nb[i];
788
+ }
789
+ return dup;
727
790
  }
728
791
 
729
- static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
730
- memset(buffer->context, value, buffer->size);
792
+ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
793
+ return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
731
794
  }
732
795
 
733
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
734
- /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
735
- /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
736
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
737
- /* .init_tensor = */ NULL, // no initialization required
738
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
739
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
740
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
741
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
742
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
743
- /* .reset = */ NULL,
744
- };
796
+ // scheduler
745
797
 
746
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
747
- /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
748
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
749
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
750
- /* .init_tensor = */ NULL, // no initialization required
751
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
752
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
753
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
754
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
755
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
756
- /* .reset = */ NULL,
757
- };
798
+ #ifndef LM_GGML_SCHED_MAX_BACKENDS
799
+ #define LM_GGML_SCHED_MAX_BACKENDS 16
800
+ #endif
758
801
 
759
- static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
760
- return "CPU";
802
+ #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
803
+ #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
804
+ #endif
761
805
 
762
- LM_GGML_UNUSED(buft);
763
- }
806
+ #ifndef LM_GGML_SCHED_MAX_COPIES
807
+ #define LM_GGML_SCHED_MAX_COPIES 4
808
+ #endif
764
809
 
765
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
766
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
767
- void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h)
768
- if (data == NULL) {
769
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
770
- return NULL;
771
- }
810
+ struct lm_ggml_backend_sched_split {
811
+ int backend_id;
812
+ int i_start;
813
+ int i_end;
814
+ struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
815
+ int n_inputs;
816
+ // graph view of this split
817
+ struct lm_ggml_cgraph graph;
818
+ };
772
819
 
773
- return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
774
- }
820
+ struct lm_ggml_backend_sched {
821
+ bool is_reset; // true if the scheduler has been reset since the last graph split
822
+ bool is_alloc;
775
823
 
776
- static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
777
- return TENSOR_ALIGNMENT;
824
+ int n_backends;
778
825
 
779
- LM_GGML_UNUSED(buft);
780
- }
826
+ lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
827
+ lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
828
+ lm_ggml_gallocr_t galloc;
781
829
 
782
- static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
783
- return true;
830
+ // hash map of the nodes in the graph
831
+ struct lm_ggml_hash_set hash_set;
832
+ int * hv_tensor_backend_ids; // [hash_set.size]
833
+ struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
784
834
 
785
- LM_GGML_UNUSED(buft);
786
- }
835
+ int * node_backend_ids; // [graph_size]
836
+ int * leaf_backend_ids; // [graph_size]
787
837
 
788
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
789
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
790
- /* .iface = */ {
791
- /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
792
- /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
793
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
794
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
795
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
796
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
797
- },
798
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
799
- /* .context = */ NULL,
800
- };
838
+ int * prev_node_backend_ids; // [graph_size]
839
+ int * prev_leaf_backend_ids; // [graph_size]
801
840
 
802
- return &lm_ggml_backend_cpu_buffer_type;
803
- }
841
+ // copy of the graph with modified inputs
842
+ struct lm_ggml_cgraph graph;
804
843
 
805
- #ifdef LM_GGML_USE_CPU_HBM
844
+ // graph splits
845
+ struct lm_ggml_backend_sched_split * splits;
846
+ int n_splits;
847
+ int splits_capacity;
806
848
 
807
- // buffer type HBM
849
+ // pipeline parallelism support
850
+ int n_copies;
851
+ int cur_copy;
852
+ lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
853
+ struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
854
+ int n_graph_inputs;
808
855
 
809
- #include <hbwmalloc.h>
856
+ struct lm_ggml_context * ctx;
810
857
 
811
- static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
812
- return "CPU_HBM";
858
+ lm_ggml_backend_sched_eval_callback callback_eval;
859
+ void * callback_eval_user_data;
813
860
 
814
- LM_GGML_UNUSED(buft);
815
- }
861
+ char * context_buffer;
862
+ size_t context_buffer_size;
816
863
 
817
- static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
818
- return "CPU_HBM";
864
+ int debug;
865
+ };
819
866
 
820
- LM_GGML_UNUSED(buf);
821
- }
867
+ #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
868
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
869
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
870
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
822
871
 
823
- static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
824
- hbw_free(buffer->context);
872
+ // returns the priority of the backend, lower id is higher priority
873
+ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
874
+ for (int i = 0; i < sched->n_backends; i++) {
875
+ if (sched->backends[i] == backend) {
876
+ return i;
877
+ }
878
+ }
879
+ return -1;
825
880
  }
826
881
 
827
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
828
- //void * ptr = hbw_malloc(size);
829
- void * ptr;
830
- int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
831
- if (result != 0) {
832
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
833
- return NULL;
882
+ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
883
+ lm_ggml_backend_buffer_t buffer = tensor->buffer;
884
+ if (buffer == NULL) {
885
+ return -1;
834
886
  }
835
887
 
836
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
837
- buffer->buft = buft;
838
- buffer->iface.get_name = lm_ggml_backend_cpu_hbm_buffer_get_name;
839
- buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
840
-
841
- return buffer;
842
- }
888
+ // find highest prio backend that supports the buffer type and the op
889
+ for (int i = 0; i < sched->n_backends; i++) {
890
+ if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
891
+ lm_ggml_backend_supports_op(sched->backends[i], op)) {
892
+ return i;
893
+ }
894
+ }
843
895
 
844
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
845
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
846
- /* .iface = */ {
847
- /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
848
- /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
849
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
850
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
851
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
852
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
853
- },
854
- /* .context = */ NULL,
855
- };
896
+ #ifndef NDEBUG
897
+ LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
898
+ __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
899
+ #endif
856
900
 
857
- return &lm_ggml_backend_cpu_buffer_type_hbm;
901
+ return -1;
858
902
  }
859
- #endif
860
903
 
861
- struct lm_ggml_backend_cpu_context {
862
- int n_threads;
863
- lm_ggml_threadpool_t threadpool;
904
+ #if 0
905
+ #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
906
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
907
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
908
+ #define GET_CAUSE(node) causes[hash_id(node)]
909
+ #else
910
+ #define SET_CAUSE(node, ...)
911
+ #define GET_CAUSE(node) ""
912
+ #endif
864
913
 
865
- uint8_t * work_data;
866
- size_t work_size;
914
+ // returns the backend that should be used for the node based on the current locations
915
+ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
916
+ // TODO: use supports_op to check if the backend supports the op
867
917
 
868
- lm_ggml_abort_callback abort_callback;
869
- void * abort_callback_data;
870
- };
918
+ // assign pre-allocated nodes to their backend
919
+ int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
920
+ if (cur_backend_id != -1) {
921
+ SET_CAUSE(tensor, "1.dst");
922
+ return cur_backend_id;
923
+ }
871
924
 
872
- static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
873
- return "CPU";
925
+ // view_src
926
+ if (tensor->view_src != NULL) {
927
+ cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
928
+ if (cur_backend_id != -1) {
929
+ SET_CAUSE(tensor, "1.vsrc");
930
+ return cur_backend_id;
931
+ }
932
+ }
874
933
 
875
- LM_GGML_UNUSED(backend);
876
- }
934
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
935
+ // since the tensor is pre-allocated, it cannot be moved to another backend
936
+ LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
937
+ }
877
938
 
878
- static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
879
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
880
- delete[] cpu_ctx->work_data;
881
- delete cpu_ctx;
882
- delete backend;
883
- }
939
+ // graph input
940
+ if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
941
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
942
+ SET_CAUSE(tensor, "1.inp");
943
+ return cur_backend_id;
944
+ }
884
945
 
885
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
886
- return lm_ggml_backend_cpu_buffer_type();
946
+ // operations with weights are preferably run on the same backend as the weights
947
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
948
+ const struct lm_ggml_tensor * src = tensor->src[i];
949
+ if (src == NULL) {
950
+ continue;
951
+ }
952
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
953
+ // not an ideal solution
954
+ if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
955
+ int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
956
+ // check if a backend with higher prio wants to offload the op
957
+ if (src_backend_id == sched->n_backends - 1) {
958
+ for (int b = 0; b < src_backend_id; b++) {
959
+ if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
960
+ SET_CAUSE(tensor, "1.off");
961
+ return b;
962
+ }
963
+ }
964
+ }
965
+ SET_CAUSE(tensor, "1.wgt%d", i);
966
+ return src_backend_id;
967
+ }
968
+ }
887
969
 
888
- LM_GGML_UNUSED(backend);
970
+ return -1;
889
971
  }
890
972
 
891
- struct lm_ggml_backend_plan_cpu {
892
- struct lm_ggml_cplan cplan;
893
- struct lm_ggml_cgraph cgraph;
894
- };
895
-
896
- static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
897
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
898
-
899
- struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
900
-
901
- cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
902
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
973
+ static char * fmt_size(size_t size) {
974
+ static char buffer[128];
975
+ if (size >= 1024*1024) {
976
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
977
+ } else {
978
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
979
+ }
980
+ return buffer;
981
+ }
903
982
 
904
- if (cpu_plan->cplan.work_size > 0) {
905
- cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
906
- if (cpu_plan->cplan.work_data == NULL) {
907
- delete cpu_plan;
908
- return NULL;
983
+ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
984
+ int cur_split = 0;
985
+ for (int i = 0; i < graph->n_nodes; i++) {
986
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
987
+ lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
988
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
989
+ sched->splits[cur_split].n_inputs);
990
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
991
+ LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
992
+ fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
993
+ }
994
+ LM_GGML_LOG_DEBUG("\n");
995
+ cur_split++;
996
+ }
997
+ struct lm_ggml_tensor * node = graph->nodes[i];
998
+ if (lm_ggml_is_view_op(node->op)) {
999
+ continue;
1000
+ }
1001
+ if (sched->debug > 1) {
1002
+ lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1003
+ LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1004
+ fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1005
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1006
+ struct lm_ggml_tensor * src = node->src[j];
1007
+ if (src == NULL) {
1008
+ continue;
1009
+ }
1010
+ lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1011
+ LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1012
+ fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1013
+ }
1014
+ LM_GGML_LOG_DEBUG("\n");
909
1015
  }
910
1016
  }
911
-
912
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
913
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
914
-
915
- return cpu_plan;
916
1017
  }
917
1018
 
918
- static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
919
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
1019
+ static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1020
+ lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1021
+ lm_ggml_backend_buffer_type_t buft = NULL;
920
1022
 
921
- delete[] cpu_plan->cplan.work_data;
922
- delete cpu_plan;
1023
+ if (buf) {
1024
+ // the tensor is already allocated
1025
+ buft = buf->buft;
1026
+ } else {
1027
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
1028
+ int tensor_backend_id = tensor_backend_id(t);
1029
+ if (tensor_backend_id == -1 && t->view_src) {
1030
+ tensor_backend_id = tensor_backend_id(t->view_src);
1031
+ }
1032
+ if (tensor_backend_id != -1) {
1033
+ buft = sched->bufts[tensor_backend_id];
1034
+ }
1035
+ }
923
1036
 
924
- LM_GGML_UNUSED(backend);
1037
+ return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
925
1038
  }
926
1039
 
927
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
928
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
929
-
930
- return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
931
-
932
- LM_GGML_UNUSED(backend);
1040
+ static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1041
+ if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1042
+ *node_backend_id = cur_backend_id;
1043
+ SET_CAUSE(node, "2.sup");
1044
+ }
933
1045
  }
934
1046
 
935
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
936
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
1047
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1048
+ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1049
+ // reset splits
1050
+ sched->n_splits = 0;
1051
+ sched->n_graph_inputs = 0;
1052
+ sched->is_reset = false;
937
1053
 
938
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
1054
+ struct lm_ggml_init_params params = {
1055
+ /* .mem_size = */ sched->context_buffer_size,
1056
+ /* .mem_buffer = */ sched->context_buffer,
1057
+ /* .no_alloc = */ true
1058
+ };
939
1059
 
940
- if (cpu_ctx->work_size < cplan.work_size) {
941
- delete[] cpu_ctx->work_data;
942
- cpu_ctx->work_data = new uint8_t[cplan.work_size];
943
- if (cpu_ctx->work_data == NULL) {
944
- cpu_ctx->work_size = 0;
945
- return LM_GGML_STATUS_ALLOC_FAILED;
946
- }
947
- cpu_ctx->work_size = cplan.work_size;
1060
+ lm_ggml_free(sched->ctx);
1061
+
1062
+ sched->ctx = lm_ggml_init(params);
1063
+ if (sched->ctx == NULL) {
1064
+ LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
948
1065
  }
949
- cplan.work_data = (uint8_t *)cpu_ctx->work_data;
950
1066
 
951
- cplan.abort_callback = cpu_ctx->abort_callback;
952
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
953
-
954
- return lm_ggml_graph_compute(cgraph, &cplan);
955
- }
956
-
957
- static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
958
- /* .get_name = */ lm_ggml_backend_cpu_get_name,
959
- /* .free = */ lm_ggml_backend_cpu_free,
960
- /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
961
- /* .set_tensor_async = */ NULL,
962
- /* .get_tensor_async = */ NULL,
963
- /* .cpy_tensor_async = */ NULL,
964
- /* .synchronize = */ NULL,
965
- /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
966
- /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
967
- /* .graph_plan_update = */ NULL,
968
- /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
969
- /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
970
- /* .supports_op = */ NULL,
971
- /* .supports_buft = */ NULL,
972
- /* .offload_op = */ NULL,
973
- /* .event_record = */ NULL,
974
- /* .event_wait = */ NULL,
975
- };
976
-
977
- static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
978
- static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
979
- return &guid;
980
- }
981
-
982
- lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
983
- struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
984
- if (ctx == NULL) {
985
- return NULL;
986
- }
987
-
988
- ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
989
- ctx->threadpool = NULL;
990
- ctx->work_data = NULL;
991
- ctx->work_size = 0;
992
- ctx->abort_callback = NULL;
993
- ctx->abort_callback_data = NULL;
994
-
995
- lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
996
- /* .guid = */ lm_ggml_backend_cpu_guid(),
997
- /* .interface = */ lm_ggml_backend_cpu_i,
998
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
999
- /* .context = */ ctx,
1000
- };
1001
-
1002
- if (cpu_backend == NULL) {
1003
- delete ctx;
1004
- return NULL;
1067
+ // pass 1: assign backends to ops with pre-allocated inputs
1068
+ for (int i = 0; i < graph->n_leafs; i++) {
1069
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1070
+ int * leaf_backend_id = &tensor_backend_id(leaf);
1071
+ // do not overwrite user assignments
1072
+ if (*leaf_backend_id == -1) {
1073
+ *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1074
+ }
1005
1075
  }
1006
1076
 
1007
- return cpu_backend;
1008
- }
1009
-
1010
- bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
1011
- return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
1012
- }
1013
-
1014
- void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
1015
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1016
-
1017
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1018
- ctx->n_threads = n_threads;
1019
- }
1020
-
1021
- void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
1022
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1077
+ for (int i = 0; i < graph->n_nodes; i++) {
1078
+ struct lm_ggml_tensor * node = graph->nodes[i];
1079
+ int * node_backend_id = &tensor_backend_id(node);
1080
+ // do not overwrite user assignments
1081
+ if (*node_backend_id == -1) {
1082
+ *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1023
1083
 
1024
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1084
+ #if 0
1085
+ // src
1086
+ if (node->op == LM_GGML_OP_NONE) {
1087
+ continue;
1088
+ }
1025
1089
 
1026
- if (ctx->threadpool && ctx->threadpool != threadpool) {
1027
- // already had a different threadpool, pause/suspend it before switching
1028
- lm_ggml_threadpool_pause(ctx->threadpool);
1090
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1091
+ struct lm_ggml_tensor * src = node->src[j];
1092
+ if (src == NULL) {
1093
+ continue;
1094
+ }
1095
+ int * src_backend_id = &tensor_backend_id(src);
1096
+ if (*src_backend_id == -1) {
1097
+ *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1098
+ }
1099
+ }
1100
+ #endif
1101
+ }
1029
1102
  }
1030
- ctx->threadpool = threadpool;
1031
- }
1032
-
1033
- void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
1034
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1035
-
1036
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1037
- ctx->abort_callback = abort_callback;
1038
- ctx->abort_callback_data = abort_callback_data;
1039
- }
1040
-
1041
- lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1042
- LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1043
- return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1044
- }
1045
-
1046
- ////////////////////////
1047
-
1048
- struct lm_ggml_backend_cpu_device_context {
1049
- std::string description = "CPU";
1050
1103
 
1051
- lm_ggml_backend_cpu_device_context() {
1052
- #ifdef __APPLE__
1053
- size_t len = 0;
1054
- if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1055
- description.resize(len);
1056
- sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1057
- }
1058
- #elif defined(__linux__)
1059
- FILE * f = fopen("/proc/cpuinfo", "r");
1060
- if (f) {
1061
- char buf[1024];
1062
- while (fgets(buf, sizeof(buf), f)) {
1063
- if (strncmp(buf, "model name", 10) == 0) {
1064
- char * p = strchr(buf, ':');
1065
- if (p) {
1066
- p++;
1067
- while (std::isspace(*p)) {
1068
- p++;
1069
- }
1070
- while (std::isspace(p[strlen(p) - 1])) {
1071
- p[strlen(p) - 1] = '\0';
1072
- }
1073
- description = p;
1074
- break;
1075
- }
1104
+ // pass 2: expand current backend assignments
1105
+ // assign the same backend to adjacent nodes
1106
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1107
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1108
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1109
+ // expand gpu down
1110
+ {
1111
+ int cur_backend_id = -1;
1112
+ for (int i = 0; i < graph->n_nodes; i++) {
1113
+ struct lm_ggml_tensor * node = graph->nodes[i];
1114
+ if (lm_ggml_is_view_op(node->op)) {
1115
+ continue;
1116
+ }
1117
+ int * node_backend_id = &tensor_backend_id(node);
1118
+ if (*node_backend_id != -1) {
1119
+ if (*node_backend_id == sched->n_backends - 1) {
1120
+ // skip cpu (lowest prio backend)
1121
+ cur_backend_id = -1;
1122
+ } else {
1123
+ cur_backend_id = *node_backend_id;
1076
1124
  }
1125
+ } else if (cur_backend_id != -1) {
1126
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1077
1127
  }
1078
- fclose(f);
1079
1128
  }
1080
- #elif defined(_WIN32)
1081
- HKEY hKey;
1082
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1083
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1084
- 0,
1085
- KEY_READ,
1086
- &hKey) == ERROR_SUCCESS) {
1087
- DWORD cpu_brand_size = 0;
1088
- if (RegQueryValueExA(hKey,
1089
- TEXT("ProcessorNameString"),
1090
- NULL,
1091
- NULL,
1092
- NULL,
1093
- &cpu_brand_size) == ERROR_SUCCESS) {
1094
- description.resize(cpu_brand_size);
1095
- if (RegQueryValueExA(hKey,
1096
- TEXT("ProcessorNameString"),
1097
- NULL,
1098
- NULL,
1099
- (LPBYTE)&description[0], // NOLINT
1100
- &cpu_brand_size) == ERROR_SUCCESS) {
1101
- if (description.find('\0') != std::string::npos) {
1102
- description.resize(description.find('\0'));
1103
- }
1129
+ }
1130
+ // expand gpu up
1131
+ {
1132
+ int cur_backend_id = -1;
1133
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1134
+ struct lm_ggml_tensor * node = graph->nodes[i];
1135
+ if (lm_ggml_is_view_op(node->op)) {
1136
+ continue;
1137
+ }
1138
+ int * node_backend_id = &tensor_backend_id(node);
1139
+ if (*node_backend_id != -1) {
1140
+ if (*node_backend_id == sched->n_backends - 1) {
1141
+ // skip cpu (lowest prio backend)
1142
+ cur_backend_id = -1;
1143
+ } else {
1144
+ cur_backend_id = *node_backend_id;
1104
1145
  }
1146
+ } else if (cur_backend_id != -1) {
1147
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1148
+ }
1149
+ }
1150
+ }
1151
+ // expand rest down
1152
+ {
1153
+ int cur_backend_id = -1;
1154
+ for (int i = 0; i < graph->n_nodes; i++) {
1155
+ struct lm_ggml_tensor * node = graph->nodes[i];
1156
+ if (lm_ggml_is_view_op(node->op)) {
1157
+ continue;
1158
+ }
1159
+ int * node_backend_id = &tensor_backend_id(node);
1160
+ if (*node_backend_id != -1) {
1161
+ cur_backend_id = *node_backend_id;
1162
+ } else if (cur_backend_id != -1) {
1163
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1164
+ }
1165
+ }
1166
+ }
1167
+ // expand rest up
1168
+ {
1169
+ int cur_backend_id = -1;
1170
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1171
+ struct lm_ggml_tensor * node = graph->nodes[i];
1172
+ if (lm_ggml_is_view_op(node->op)) {
1173
+ continue;
1174
+ }
1175
+ int * node_backend_id = &tensor_backend_id(node);
1176
+ if (*node_backend_id != -1) {
1177
+ cur_backend_id = *node_backend_id;
1178
+ } else if (cur_backend_id != -1) {
1179
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1105
1180
  }
1106
- RegCloseKey(hKey);
1107
1181
  }
1108
- #endif
1109
1182
  }
1110
- };
1111
-
1112
- static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
1113
- return "CPU";
1114
-
1115
- LM_GGML_UNUSED(dev);
1116
- }
1117
-
1118
- static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
1119
- struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
1120
-
1121
- return ctx->description.c_str();
1122
- }
1123
1183
 
1124
- static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
1125
- // TODO
1126
- *free = 0;
1127
- *total = 0;
1128
-
1129
- LM_GGML_UNUSED(dev);
1130
- }
1131
-
1132
- static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
1133
- return LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
1134
-
1135
- LM_GGML_UNUSED(dev);
1136
- }
1137
-
1138
- static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
1139
- props->name = lm_ggml_backend_cpu_device_get_name(dev);
1140
- props->description = lm_ggml_backend_cpu_device_get_description(dev);
1141
- props->type = lm_ggml_backend_cpu_device_get_type(dev);
1142
- lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1143
- props->caps = {
1144
- /* .async = */ false,
1145
- /* .host_buffer = */ false,
1146
- /* .buffer_from_host_ptr = */ true,
1147
- /* .events = */ false,
1148
- };
1149
- }
1150
-
1151
- static lm_ggml_backend_t lm_ggml_backend_cpu_device_init(lm_ggml_backend_dev_t dev, const char * params) {
1152
- return lm_ggml_backend_cpu_init();
1153
-
1154
- LM_GGML_UNUSED(dev);
1155
- LM_GGML_UNUSED(params);
1156
- }
1157
-
1158
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
1159
- return lm_ggml_backend_cpu_buffer_type();
1160
-
1161
- LM_GGML_UNUSED(dev);
1162
- }
1163
-
1164
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1165
- return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
1166
-
1167
- LM_GGML_UNUSED(dev);
1168
- LM_GGML_UNUSED(max_tensor_size);
1169
- }
1170
-
1171
- static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
1172
- switch (op->op) {
1173
- case LM_GGML_OP_CPY:
1174
- return
1175
- op->type != LM_GGML_TYPE_IQ2_XXS &&
1176
- op->type != LM_GGML_TYPE_IQ2_XS &&
1177
- op->type != LM_GGML_TYPE_IQ1_S &&
1178
- op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
1179
- case LM_GGML_OP_MUL_MAT:
1180
- return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
1181
- case LM_GGML_OP_ROPE_BACK:
1182
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
1183
- case LM_GGML_OP_IM2COL_BACK:
1184
- return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
1185
- case LM_GGML_OP_OUT_PROD:
1186
- return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
1187
- default:
1188
- return true;
1184
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1185
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1186
+ // however, we also need to verify that the sources are in compatible buffer types
1187
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1188
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1189
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1190
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1191
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1192
+ for (int i = 0; i < graph->n_nodes; i++) {
1193
+ struct lm_ggml_tensor * node = graph->nodes[i];
1194
+ if (lm_ggml_is_view_op(node->op)) {
1195
+ continue;
1196
+ }
1197
+ int * node_backend_id = &tensor_backend_id(node);
1198
+ if (*node_backend_id == -1) {
1199
+ // unassigned node: find the backend with the most supported inputs
1200
+ int n_supported_best = -1;
1201
+ for (int b = 0; b < sched->n_backends; b++) {
1202
+ if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1203
+ int n_supported = 0;
1204
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1205
+ struct lm_ggml_tensor * src = node->src[j];
1206
+ if (src == NULL) {
1207
+ continue;
1208
+ }
1209
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1210
+ n_supported++;
1211
+ }
1212
+ }
1213
+ if (n_supported > n_supported_best) {
1214
+ n_supported_best = n_supported;
1215
+ *node_backend_id = b;
1216
+ SET_CAUSE(node, "3.best");
1217
+ }
1218
+ }
1219
+ }
1220
+ } else {
1221
+ // assigned node: upgrade to higher prio backend if possible
1222
+ for (int b = 0; b < *node_backend_id; b++) {
1223
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1224
+ bool supported = true;
1225
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1226
+ struct lm_ggml_tensor * src = node->src[j];
1227
+ if (src == NULL) {
1228
+ continue;
1229
+ }
1230
+ if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1231
+ supported = false;
1232
+ break;
1233
+ }
1234
+ }
1235
+ if (supported) {
1236
+ *node_backend_id = b;
1237
+ SET_CAUSE(node, "3.upg");
1238
+ break;
1239
+ }
1240
+ }
1241
+ }
1242
+ }
1189
1243
  }
1190
1244
 
1191
- LM_GGML_UNUSED(dev);
1192
- }
1193
-
1194
- static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
1195
- return lm_ggml_backend_buft_is_host(buft);
1196
-
1197
- LM_GGML_UNUSED(dev);
1198
- }
1199
-
1200
- static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
1201
- /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
1202
- /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
1203
- /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
1204
- /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
1205
- /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
1206
- /* .init_backend = */ lm_ggml_backend_cpu_device_init,
1207
- /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
1208
- /* .get_host_buffer_type = */ NULL,
1209
- /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_ptr,
1210
- /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
1211
- /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
1212
- /* .offload_op = */ NULL,
1213
- /* .event_new = */ NULL,
1214
- /* .event_free = */ NULL,
1215
- /* .event_synchronize = */ NULL,
1216
- };
1217
-
1218
- ////////////////////////
1219
-
1220
- static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
1221
- return "CPU";
1222
-
1223
- LM_GGML_UNUSED(reg);
1224
- }
1225
-
1226
- static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
1227
- return 1;
1228
-
1229
- LM_GGML_UNUSED(reg);
1230
- }
1245
+ // pass 4: assign backends to remaining src from dst and view_src
1246
+ for (int i = 0; i < graph->n_nodes; i++) {
1247
+ struct lm_ggml_tensor * node = graph->nodes[i];
1248
+ int * cur_backend_id = &tensor_backend_id(node);
1249
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1250
+ *cur_backend_id = tensor_backend_id(node->view_src);
1251
+ SET_CAUSE(node, "4.vsrc");
1252
+ }
1253
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1254
+ struct lm_ggml_tensor * src = node->src[j];
1255
+ if (src == NULL) {
1256
+ continue;
1257
+ }
1258
+ int * src_backend_id = &tensor_backend_id(src);
1259
+ if (*src_backend_id == -1) {
1260
+ if (src->view_src != NULL) {
1261
+ // views are always on the same backend as the source
1262
+ *src_backend_id = tensor_backend_id(src->view_src);
1263
+ SET_CAUSE(src, "4.vsrc");
1264
+ } else {
1265
+ *src_backend_id = *cur_backend_id;
1266
+ SET_CAUSE(src, "4.cur");
1267
+ }
1268
+ }
1269
+ }
1270
+ }
1231
1271
 
1232
- static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
1233
- LM_GGML_ASSERT(index == 0);
1272
+ // pass 5: split graph, find tensors that need to be copied
1273
+ {
1274
+ int i_split = 0;
1275
+ struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1276
+ // find the backend of the first split, skipping view ops
1277
+ int i = 0;
1278
+ for (; i < graph->n_nodes; i++) {
1279
+ struct lm_ggml_tensor * node = graph->nodes[i];
1280
+ if (!lm_ggml_is_view_op(node->op)) {
1281
+ split->backend_id = tensor_backend_id(node);
1282
+ break;
1283
+ }
1284
+ }
1285
+ split->i_start = 0;
1286
+ split->n_inputs = 0;
1287
+ int cur_backend_id = split->backend_id;
1288
+ for (; i < graph->n_nodes; i++) {
1289
+ struct lm_ggml_tensor * node = graph->nodes[i];
1234
1290
 
1235
- static lm_ggml_backend_cpu_device_context ctx;
1236
- static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
1237
- /* .iface = */ lm_ggml_backend_cpu_device_i,
1238
- /* .reg = */ reg,
1239
- /* .context = */ &ctx,
1240
- };
1291
+ if (lm_ggml_is_view_op(node->op)) {
1292
+ continue;
1293
+ }
1241
1294
 
1242
- return &lm_ggml_backend_cpu_device;
1243
- }
1295
+ const int node_backend_id = tensor_backend_id(node);
1244
1296
 
1245
- static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
1246
- if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
1247
- return (void *)lm_ggml_backend_cpu_set_n_threads;
1248
- }
1249
- return NULL;
1297
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1250
1298
 
1251
- LM_GGML_UNUSED(reg);
1252
- }
1299
+ // check if we should start a new split based on the sources of the current node
1300
+ bool need_new_split = false;
1301
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1302
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1303
+ struct lm_ggml_tensor * src = node->src[j];
1304
+ if (src == NULL) {
1305
+ continue;
1306
+ }
1307
+ // check if a weight is on a different and incompatible backend
1308
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1309
+ if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1310
+ int src_backend_id = tensor_backend_id(src);
1311
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1312
+ need_new_split = true;
1313
+ break;
1314
+ }
1315
+ }
1316
+ // check if the split has too many inputs
1317
+ // FIXME: count the number of inputs instead of only checking when full
1318
+ if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1319
+ const size_t id = hash_id(src);
1320
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1321
+ bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1322
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1323
+ need_new_split = true;
1324
+ break;
1325
+ }
1326
+ }
1327
+ }
1328
+ }
1253
1329
 
1254
- static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
1255
- /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
1256
- /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
1257
- /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
1258
- /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
1259
- };
1330
+ if (node_backend_id != cur_backend_id || need_new_split) {
1331
+ split->i_end = i;
1332
+ i_split++;
1333
+ if (i_split >= sched->splits_capacity) {
1334
+ sched->splits_capacity *= 2;
1335
+ sched->splits = (lm_ggml_backend_sched_split *)
1336
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1337
+ LM_GGML_ASSERT(sched->splits != NULL);
1338
+ }
1339
+ split = &sched->splits[i_split];
1340
+ split->backend_id = node_backend_id;
1341
+ split->i_start = i;
1342
+ split->n_inputs = 0;
1343
+ cur_backend_id = node_backend_id;
1344
+ }
1260
1345
 
1261
- lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
1262
- static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
1263
- /* .iface = */ lm_ggml_backend_cpu_reg_i,
1264
- /* .context = */ NULL,
1265
- };
1346
+ // find inputs that are not on the same backend
1347
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1348
+ struct lm_ggml_tensor * src = node->src[j];
1349
+ if (src == NULL) {
1350
+ continue;
1351
+ }
1266
1352
 
1267
- return &lm_ggml_backend_cpu_reg;
1268
- }
1353
+ size_t src_id = hash_id(src);
1354
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1355
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1269
1356
 
1270
- // multi-buffer buffer
1357
+ if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1358
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1359
+ lm_ggml_backend_t backend = sched->backends[src_backend_id];
1360
+ for (int c = 0; c < sched->n_copies; c++) {
1361
+ struct lm_ggml_tensor * tensor_copy;
1362
+ if (c == sched->cur_copy) {
1363
+ tensor_copy = src; // use the original tensor as the current copy
1364
+ } else {
1365
+ tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1366
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1367
+ }
1368
+ if (sched->n_copies > 1) {
1369
+ lm_ggml_set_input(tensor_copy);
1370
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1371
+ }
1372
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1373
+ SET_CAUSE(tensor_copy, "4.cpy");
1374
+ }
1375
+ int n_graph_inputs = sched->n_graph_inputs++;
1376
+ LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1377
+ sched->graph_inputs[n_graph_inputs] = src;
1378
+ }
1379
+ }
1271
1380
 
1272
- struct lm_ggml_backend_multi_buffer_context {
1273
- lm_ggml_backend_buffer_t * buffers;
1274
- size_t n_buffers;
1275
- };
1381
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1382
+ // create a copy of the input in the split's backend
1383
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1384
+ lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1385
+ for (int c = 0; c < sched->n_copies; c++) {
1386
+ struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1387
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1388
+ if (sched->n_copies > 1) {
1389
+ lm_ggml_set_input(tensor_copy);
1390
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1391
+ }
1392
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1393
+ SET_CAUSE(tensor_copy, "4.cpy");
1394
+ }
1395
+ int n_inputs = split->n_inputs++;
1396
+ LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1397
+ split->inputs[n_inputs] = src;
1398
+ }
1399
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1400
+ }
1401
+ }
1402
+ }
1403
+ split->i_end = graph->n_nodes;
1404
+ sched->n_splits = i_split + 1;
1405
+ }
1276
1406
 
1277
- static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
1278
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1407
+ if (sched->debug) {
1408
+ lm_ggml_backend_sched_print_assignments(sched, graph);
1409
+ }
1279
1410
 
1280
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
1281
- }
1411
+ // swap node_backend_ids and leaf_backend_ids with prevs
1412
+ {
1413
+ int * tmp = sched->node_backend_ids;
1414
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1415
+ sched->prev_node_backend_ids = tmp;
1282
1416
 
1283
- static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
1284
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1285
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1286
- lm_ggml_backend_buffer_free(ctx->buffers[i]);
1417
+ tmp = sched->leaf_backend_ids;
1418
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1419
+ sched->prev_leaf_backend_ids = tmp;
1287
1420
  }
1288
1421
 
1289
- free(ctx->buffers);
1290
- free(ctx);
1291
- }
1292
-
1293
- static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
1294
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1295
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1296
- lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
1422
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1423
+ if (sched->graph.size < graph_size) {
1424
+ sched->graph.size = graph_size;
1425
+ sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1426
+ sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1427
+ LM_GGML_ASSERT(sched->graph.nodes != NULL);
1428
+ LM_GGML_ASSERT(sched->graph.leafs != NULL);
1297
1429
  }
1298
- }
1430
+ sched->graph.n_nodes = 0;
1431
+ sched->graph.n_leafs = 0;
1299
1432
 
1300
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
1301
- /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
1302
- /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
1303
- /* .get_base = */ NULL,
1304
- /* .init_tensor = */ NULL,
1305
- /* .memset_tensor = */ NULL,
1306
- /* .set_tensor = */ NULL,
1307
- /* .get_tensor = */ NULL,
1308
- /* .cpy_tensor = */ NULL,
1309
- /* .clear = */ lm_ggml_backend_multi_buffer_clear,
1310
- /* .reset = */ NULL,
1311
- };
1433
+ struct lm_ggml_cgraph * graph_copy = &sched->graph;
1312
1434
 
1313
- lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
1314
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
1315
- ctx->n_buffers = n_buffers;
1316
- ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
1435
+ for (int i = 0; i < sched->n_splits; i++) {
1436
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1437
+ split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1317
1438
 
1318
- LM_GGML_ASSERT(ctx->buffers != NULL);
1439
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1440
+ for (int j = 0; j < split->n_inputs; j++) {
1441
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1319
1442
 
1320
- size_t total_size = 0;
1321
- for (size_t i = 0; i < n_buffers; i++) {
1322
- ctx->buffers[i] = buffers[i];
1323
- total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
1324
- }
1443
+ struct lm_ggml_tensor * input = split->inputs[j];
1444
+ const size_t input_id = hash_id(input);
1445
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1325
1446
 
1326
- return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
1327
- }
1447
+ // add a dependency to the input source so that it is not freed before the copy is done
1448
+ struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
1449
+ input_dep->src[0] = input;
1450
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1451
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1328
1452
 
1329
- bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
1330
- return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
1331
- }
1453
+ // add a dependency to the input copy so that it is allocated at the start of the split
1454
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1455
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1456
+ }
1332
1457
 
1333
- void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
1334
- LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
1335
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1336
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1337
- lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1458
+ for (int j = split->i_start; j < split->i_end; j++) {
1459
+ assert(graph_copy->size > graph_copy->n_nodes);
1460
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1461
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1462
+ }
1338
1463
  }
1339
- }
1340
1464
 
1341
- // creates a copy of the tensor with the same memory layout
1342
- static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
1343
- struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
1344
- for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
1345
- dup->nb[i] = tensor->nb[i];
1465
+ if (sched->n_copies > 1) {
1466
+ // add input copies as leafs so that they are allocated first
1467
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1468
+ struct lm_ggml_tensor * input = sched->graph_inputs[i];
1469
+ size_t id = hash_id(input);
1470
+ int backend_id = tensor_backend_id(input);
1471
+ for (int c = 0; c < sched->n_copies; c++) {
1472
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1473
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1474
+ assert(graph_copy->size > graph_copy->n_leafs);
1475
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1476
+ }
1477
+ }
1478
+
1479
+ for (int i = 0; i < sched->n_splits; i++) {
1480
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1481
+ int backend_id = split->backend_id;
1482
+ for (int j = 0; j < split->n_inputs; j++) {
1483
+ struct lm_ggml_tensor * input = split->inputs[j];
1484
+ size_t id = hash_id(input);
1485
+ for (int c = 0; c < sched->n_copies; c++) {
1486
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1487
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1488
+ assert(graph_copy->size > graph_copy->n_leafs);
1489
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1490
+ }
1491
+ }
1492
+ }
1346
1493
  }
1347
- return dup;
1348
- }
1349
1494
 
1350
- static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
1351
- return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
1495
+ // add leafs from the original graph
1496
+ for (int i = 0; i < graph->n_leafs; i++) {
1497
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1498
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1499
+ assert(graph_copy->size > graph_copy->n_leafs);
1500
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1501
+ }
1352
1502
  }
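The debug dump invoked at the end of the splitting pass above is gated by sched->debug, which lm_ggml_backend_sched_new (further down in this file) reads from the LM_GGML_SCHED_DEBUG environment variable. A hedged sketch of turning it on from code before the scheduler is created; setenv is POSIX, so Windows builds would use _putenv_s instead:

#include <cstdlib>

// Enable the scheduler's per-node assignment / split dump.
// Must run before lm_ggml_backend_sched_new, which caches the value.
static void enable_sched_debug(void) {
    setenv("LM_GGML_SCHED_DEBUG", "1", /*overwrite=*/1);
}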
1353
1503
 
1354
- // scheduler
1355
-
1356
- #ifndef LM_GGML_SCHED_MAX_BACKENDS
1357
- #define LM_GGML_SCHED_MAX_BACKENDS 16
1358
- #endif
1359
-
1360
- #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
1361
- #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
1362
- #endif
1504
+ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
1505
+ bool backend_ids_changed = false;
1506
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
1507
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1508
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1509
+ backend_ids_changed = true;
1510
+ break;
1511
+ }
1512
+ }
1513
+ if (!backend_ids_changed) {
1514
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
1515
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1516
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1517
+ backend_ids_changed = true;
1518
+ break;
1519
+ }
1520
+ }
1521
+ }
1363
1522
 
1364
- #ifndef LM_GGML_SCHED_MAX_COPIES
1365
- #define LM_GGML_SCHED_MAX_COPIES 4
1523
+ // allocate graph
1524
+ if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1525
+ // the re-allocation may cause the split inputs to be moved to a different address
1526
+ lm_ggml_backend_sched_synchronize(sched);
1527
+ #ifndef NDEBUG
1528
+ LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1366
1529
  #endif
1530
+ lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1531
+ if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1532
+ LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
1533
+ return false;
1534
+ }
1535
+ }
1367
1536
 
1368
- struct lm_ggml_backend_sched_split {
1369
- int backend_id;
1370
- int i_start;
1371
- int i_end;
1372
- struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1373
- int n_inputs;
1374
- // graph view of this split
1375
- struct lm_ggml_cgraph graph;
1376
- };
1377
-
1378
- struct lm_ggml_backend_sched {
1379
- bool is_reset; // true if the scheduler has been reset since the last graph split
1380
- bool is_alloc;
1381
-
1382
- int n_backends;
1537
+ return true;
1538
+ }
1383
1539
 
1384
- lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
1385
- lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
1386
- lm_ggml_gallocr_t galloc;
1540
+ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
1541
+ struct lm_ggml_backend_sched_split * splits = sched->splits;
1387
1542
 
1388
- // hash map of the nodes in the graph
1389
- struct lm_ggml_hash_set hash_set;
1390
- int * hv_tensor_backend_ids; // [hash_set.size]
1391
- struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
1543
+ for (int i = 0; i < sched->n_splits; i++) {
1544
+ struct lm_ggml_backend_sched_split * split = &splits[i];
1545
+ int split_backend_id = split->backend_id;
1546
+ lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
1392
1547
 
1393
- int * node_backend_ids; // [graph_size]
1394
- int * leaf_backend_ids; // [graph_size]
1548
+ // copy the input tensors to the split backend
1549
+ for (int j = 0; j < split->n_inputs; j++) {
1550
+ lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1551
+ struct lm_ggml_tensor * input = split->inputs[j];
1552
+ struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1395
1553
 
1396
- int * prev_node_backend_ids; // [graph_size]
1397
- int * prev_leaf_backend_ids; // [graph_size]
1554
+ if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1555
+ // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1556
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1557
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1558
+ } else {
1559
+ lm_ggml_backend_synchronize(split_backend);
1560
+ }
1561
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1562
+ } else {
1563
+ // wait for the split backend to finish using the input before overwriting it
1564
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1565
+ lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1566
+ } else {
1567
+ lm_ggml_backend_synchronize(split_backend);
1568
+ }
1569
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1570
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1571
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1572
+ lm_ggml_backend_synchronize(input_backend);
1573
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1574
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1575
+ } else {
1576
+ lm_ggml_backend_synchronize(split_backend);
1577
+ }
1578
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1579
+ }
1580
+ }
1581
+ }
1398
1582
 
1399
- // copy of the graph with modified inputs
1400
- struct lm_ggml_cgraph graph;
1583
+ if (!sched->callback_eval) {
1584
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
1585
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1586
+ return ec;
1587
+ }
1588
+ } else {
1589
+ // similar to lm_ggml_backend_compare_graph_backend
1590
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1591
+ struct lm_ggml_tensor * t = split->graph.nodes[j0];
1401
1592
 
1402
- // graph splits
1403
- struct lm_ggml_backend_sched_split * splits;
1404
- int n_splits;
1405
- int splits_capacity;
1593
+ // check if the user needs data from this node
1594
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1406
1595
 
1407
- // pipeline parallelism support
1408
- int n_copies;
1409
- int cur_copy;
1410
- lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
1411
- struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1412
- int n_graph_inputs;
1596
+ int j1 = j0;
1413
1597
 
1414
- struct lm_ggml_context * ctx;
1598
+ // determine the range [j0, j1] of nodes that can be computed together
1599
+ while (!need && j1 < split->graph.n_nodes - 1) {
1600
+ t = split->graph.nodes[++j1];
1601
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1602
+ }
1415
1603
 
1416
- lm_ggml_backend_sched_eval_callback callback_eval;
1417
- void * callback_eval_user_data;
1604
+ struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
1418
1605
 
1419
- char * context_buffer;
1420
- size_t context_buffer_size;
1606
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
1607
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1608
+ return ec;
1609
+ }
1421
1610
 
1422
- bool debug;
1423
- };
1611
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1612
+ lm_ggml_backend_synchronize(split_backend);
1424
1613
 
1425
- #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
1426
- #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1427
- #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1428
- #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1614
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1615
+ break;
1616
+ }
1429
1617
 
1430
- // returns the priority of the backend, lower id is higher priority
1431
- static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1432
- for (int i = 0; i < sched->n_backends; i++) {
1433
- if (sched->backends[i] == backend) {
1434
- return i;
1618
+ j0 = j1;
1619
+ }
1435
1620
  }
1436
- }
1437
- return -1;
1438
- }
1439
-
1440
- static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
1441
- lm_ggml_backend_buffer_t buffer = tensor->buffer;
1442
- if (buffer == NULL) {
1443
- return -1;
1444
- }
1445
1621
 
1446
- // find highest prio backend that supports the buffer type and the op
1447
- for (int i = 0; i < sched->n_backends; i++) {
1448
- if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1449
- lm_ggml_backend_supports_op(sched->backends[i], op)) {
1450
- return i;
1622
+ // record the event of this copy
1623
+ if (split->n_inputs > 0) {
1624
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1625
+ lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
1626
+ }
1451
1627
  }
1452
1628
  }
1453
1629
 
1454
- #ifndef NDEBUG
1455
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1456
- __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
1457
- #endif
1630
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1458
1631
 
1459
- return -1;
1632
+ return LM_GGML_STATUS_SUCCESS;
1460
1633
  }
1461
1634
 
1462
- #if 0
1463
- #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
1464
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1465
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1466
- #define GET_CAUSE(node) causes[hash_id(node)]
1467
- #else
1468
- #define SET_CAUSE(node, ...)
1469
- #define GET_CAUSE(node) ""
1470
- #endif
1635
+ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1636
+ lm_ggml_backend_t * backends,
1637
+ lm_ggml_backend_buffer_type_t * bufts,
1638
+ int n_backends,
1639
+ size_t graph_size,
1640
+ bool parallel) {
1641
+ LM_GGML_ASSERT(n_backends > 0);
1642
+ LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
1643
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1471
1644
 
1472
- // returns the backend that should be used for the node based on the current locations
1473
- static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
1474
- // TODO: use supports_op to check if the backend supports the op
1645
+ struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
1475
1646
 
1476
- // assign pre-allocated nodes to their backend
1477
- int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1478
- if (cur_backend_id != -1) {
1479
- SET_CAUSE(tensor, "1.dst");
1480
- return cur_backend_id;
1481
- }
1647
+ const char * LM_GGML_SCHED_DEBUG = getenv("LM_GGML_SCHED_DEBUG");
1648
+ sched->debug = LM_GGML_SCHED_DEBUG ? atoi(LM_GGML_SCHED_DEBUG) : 0;
1649
+ sched->n_backends = n_backends;
1650
+ sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
1482
1651
 
1483
- // view_src
1484
- if (tensor->view_src != NULL) {
1485
- cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1486
- if (cur_backend_id != -1) {
1487
- SET_CAUSE(tensor, "1.vsrc");
1488
- return cur_backend_id;
1489
- }
1490
- }
1652
+ // initialize hash table
1653
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1654
+ sched->hash_set = lm_ggml_hash_set_new(graph_size);
1655
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1656
+ sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1491
1657
 
1492
- if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1493
- // since the tensor is pre-allocated, it cannot be moved to another backend
1494
- LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
1495
- }
1658
+ const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
1659
+ const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1660
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1661
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1662
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1663
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1496
1664
 
1497
- // graph input
1498
- if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1499
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1500
- SET_CAUSE(tensor, "1.inp");
1501
- return cur_backend_id;
1502
- }
1665
+ sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1666
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
1503
1667
 
1504
- // operations with weights are preferably run on the same backend as the weights
1505
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1506
- const struct lm_ggml_tensor * src = tensor->src[i];
1507
- if (src == NULL) {
1508
- continue;
1509
- }
1510
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1511
- int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1512
- // check if a backend with higher prio wants to offload the op
1513
- if (src_backend_id == sched->n_backends - 1) {
1514
- for (int b = 0; b < src_backend_id; b++) {
1515
- if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
1516
- SET_CAUSE(tensor, "1.off");
1517
- return b;
1518
- }
1519
- }
1668
+ const int initial_splits_capacity = 16;
1669
+ sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1670
+ sched->splits_capacity = initial_splits_capacity;
1671
+
1672
+ for (int b = 0; b < n_backends; b++) {
1673
+ sched->backends[b] = backends[b];
1674
+ sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
1675
+ LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1676
+
1677
+ if (sched->n_copies > 1) {
1678
+ for (int c = 0; c < sched->n_copies; c++) {
1679
+ sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
1520
1680
  }
1521
- SET_CAUSE(tensor, "1.wgt%d", i);
1522
- return src_backend_id;
1523
1681
  }
1524
1682
  }
1525
1683
 
1526
- return -1;
1684
+ sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
1685
+
1686
+ lm_ggml_backend_sched_reset(sched);
1687
+
1688
+ return sched;
1527
1689
  }
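For context, a minimal construction and teardown sketch around the constructor above; lm_ggml_backend_cpu_init is assumed to exist elsewhere in the package (it is not part of this hunk), and passing NULL for bufts makes the loop above fall back to each backend's default buffer type:

// Single CPU backend, no pipeline parallelism; the CPU backend must be last.
static lm_ggml_backend_sched_t make_cpu_sched(void) {
    lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init(); // assumed helper
    lm_ggml_backend_t backends[] = { cpu };
    return lm_ggml_backend_sched_new(backends, /*bufts=*/NULL, /*n_backends=*/1,
                                     /*graph_size=*/LM_GGML_DEFAULT_GRAPH_SIZE,
                                     /*parallel=*/false);
}
// teardown: lm_ggml_backend_sched_free(sched), then free the backends themselves
// (a real caller would keep the cpu handle around for that).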
1528
1690
 
1529
- static char * fmt_size(size_t size) {
1530
- static char buffer[128];
1531
- if (size >= 1024*1024) {
1532
- snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1533
- } else {
1534
- snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1691
+ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
1692
+ if (sched == NULL) {
1693
+ return;
1535
1694
  }
1536
- return buffer;
1695
+ for (int b = 0; b < sched->n_backends; b++) {
1696
+ for (int c = 0; c < sched->n_copies; c++) {
1697
+ lm_ggml_backend_event_free(sched->events[b][c]);
1698
+ }
1699
+ }
1700
+ lm_ggml_gallocr_free(sched->galloc);
1701
+ lm_ggml_free(sched->ctx);
1702
+ lm_ggml_hash_set_free(&sched->hash_set);
1703
+ free(sched->splits);
1704
+ free(sched->hv_tensor_backend_ids);
1705
+ free(sched->hv_tensor_copies);
1706
+ free(sched->node_backend_ids);
1707
+ free(sched->leaf_backend_ids);
1708
+ free(sched->prev_node_backend_ids);
1709
+ free(sched->prev_leaf_backend_ids);
1710
+ free(sched->context_buffer);
1711
+ free(sched->graph.nodes);
1712
+ free(sched->graph.leafs);
1713
+ free(sched);
1537
1714
  }
1538
1715
 
1539
- static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1540
- int cur_split = 0;
1541
- for (int i = 0; i < graph->n_nodes; i++) {
1542
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1543
- lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1544
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
1545
- sched->splits[cur_split].n_inputs);
1546
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1547
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1548
- fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
1549
- }
1550
- fprintf(stderr, "\n");
1551
- cur_split++;
1552
- }
1553
- struct lm_ggml_tensor * node = graph->nodes[i];
1554
- if (lm_ggml_is_view_op(node->op)) {
1555
- continue;
1556
- }
1557
- lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1558
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1559
- fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1560
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1561
- struct lm_ggml_tensor * src = node->src[j];
1562
- if (src == NULL) {
1563
- continue;
1564
- }
1565
- lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1566
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1567
- fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1568
- }
1569
- fprintf(stderr, "\n");
1716
+ void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
1717
+ // reset state for the next run
1718
+ if (!sched->is_reset) {
1719
+ lm_ggml_hash_set_reset(&sched->hash_set);
1720
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1721
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1722
+ sched->is_reset = true;
1570
1723
  }
1724
+ sched->is_alloc = false;
1571
1725
  }
1572
1726
 
1573
- static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1574
- lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1575
- lm_ggml_backend_buffer_type_t buft = NULL;
1727
+ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
1728
+ LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1576
1729
 
1577
- if (buf) {
1578
- // the tensor is already allocated
1579
- buft = buf->buft;
1580
- } else {
1581
- // see if the tensor already has a backend assigned, and use the buffer type of that backend
1582
- int tensor_backend_id = tensor_backend_id(t);
1583
- if (tensor_backend_id == -1 && t->view_src) {
1584
- tensor_backend_id = tensor_backend_id(t->view_src);
1585
- }
1586
- if (tensor_backend_id != -1) {
1587
- buft = sched->bufts[tensor_backend_id];
1588
- }
1730
+ lm_ggml_backend_sched_split_graph(sched, measure_graph);
1731
+
1732
+ if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1733
+ return false;
1589
1734
  }
1590
1735
 
1591
- return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
1592
- }
1736
+ lm_ggml_backend_sched_reset(sched);
1737
+ lm_ggml_backend_sched_synchronize(sched);
1593
1738
 
1594
- static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1595
- if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1596
- *node_backend_id = cur_backend_id;
1597
- SET_CAUSE(node, "2.sup");
1598
- }
1739
+ return true;
1599
1740
  }
1600
1741
 
1601
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1602
- static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1603
- // reset splits
1604
- sched->n_splits = 0;
1605
- sched->n_graph_inputs = 0;
1606
- sched->is_reset = false;
1742
+ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1743
+ LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1607
1744
 
1608
- struct lm_ggml_init_params params = {
1609
- /* .mem_size = */ sched->context_buffer_size,
1610
- /* .mem_buffer = */ sched->context_buffer,
1611
- /* .no_alloc = */ true
1612
- };
1745
+ lm_ggml_backend_sched_split_graph(sched, graph);
1613
1746
 
1614
- lm_ggml_free(sched->ctx);
1615
1747
 
1616
- sched->ctx = lm_ggml_init(params);
1617
- if (sched->ctx == NULL) {
1618
- LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
1748
+ if (!lm_ggml_backend_sched_alloc_splits(sched)) {
1749
+ return false;
1619
1750
  }
1620
1751
 
1621
- // pass 1: assign backends to ops with pre-allocated inputs
1622
- for (int i = 0; i < graph->n_leafs; i++) {
1623
- struct lm_ggml_tensor * leaf = graph->leafs[i];
1624
- int * leaf_backend_id = &tensor_backend_id(leaf);
1625
- // do not overwrite user assignments
1626
- if (*leaf_backend_id == -1) {
1627
- *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1628
- }
1629
- }
1752
+ sched->is_alloc = true;
1630
1753
 
1631
- for (int i = 0; i < graph->n_nodes; i++) {
1632
- struct lm_ggml_tensor * node = graph->nodes[i];
1633
- int * node_backend_id = &tensor_backend_id(node);
1634
- // do not overwrite user assignments
1635
- if (*node_backend_id == -1) {
1636
- *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1754
+ return true;
1755
+ }
1637
1756
 
1638
- #if 0
1639
- // src
1640
- if (node->op == LM_GGML_OP_NONE) {
1641
- continue;
1642
- }
1757
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1758
+ enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
1759
+ lm_ggml_backend_sched_synchronize(sched);
1760
+ return err;
1761
+ }
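The synchronous wrapper above is the usual entry point; a hedged sketch of the intended call order around it, with the graphs assumed to be built by the caller:

// Reserve once with the largest expected graph, then compute and reset.
static bool sched_run(lm_ggml_backend_sched_t sched,
                      struct lm_ggml_cgraph * worst_case_graph,
                      struct lm_ggml_cgraph * graph) {
    if (!lm_ggml_backend_sched_reserve(sched, worst_case_graph)) {
        return false; // could not reserve backend buffers
    }
    // splits the graph, allocates it and synchronizes internally
    if (lm_ggml_backend_sched_graph_compute(sched, graph) != LM_GGML_STATUS_SUCCESS) {
        return false;
    }
    lm_ggml_backend_sched_reset(sched); // clear assignments before the next graph
    return true;
}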
1643
1762
 
1644
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1645
- struct lm_ggml_tensor * src = node->src[j];
1646
- if (src == NULL) {
1647
- continue;
1648
- }
1649
- int * src_backend_id = &tensor_backend_id(src);
1650
- if (*src_backend_id == -1) {
1651
- *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1652
- }
1653
- }
1654
- #endif
1655
- }
1763
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1764
+ if (!sched->is_reset && !sched->is_alloc) {
1765
+ lm_ggml_backend_sched_reset(sched);
1656
1766
  }
1657
1767
 
1658
- // pass 2: expand current backend assignments
1659
- // assign the same backend to adjacent nodes
1660
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1661
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1662
- // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1663
- // expand gpu down
1664
- {
1665
- int cur_backend_id = -1;
1666
- for (int i = 0; i < graph->n_nodes; i++) {
1667
- struct lm_ggml_tensor * node = graph->nodes[i];
1668
- if (lm_ggml_is_view_op(node->op)) {
1669
- continue;
1670
- }
1671
- int * node_backend_id = &tensor_backend_id(node);
1672
- if (*node_backend_id != -1) {
1673
- if (*node_backend_id == sched->n_backends - 1) {
1674
- // skip cpu (lowest prio backend)
1675
- cur_backend_id = -1;
1676
- } else {
1677
- cur_backend_id = *node_backend_id;
1678
- }
1679
- } else if (cur_backend_id != -1) {
1680
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1681
- }
1682
- }
1683
- }
1684
- // expand gpu up
1685
- {
1686
- int cur_backend_id = -1;
1687
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1688
- struct lm_ggml_tensor * node = graph->nodes[i];
1689
- if (lm_ggml_is_view_op(node->op)) {
1690
- continue;
1691
- }
1692
- int * node_backend_id = &tensor_backend_id(node);
1693
- if (*node_backend_id != -1) {
1694
- if (*node_backend_id == sched->n_backends - 1) {
1695
- // skip cpu (lowest prio backend)
1696
- cur_backend_id = -1;
1697
- } else {
1698
- cur_backend_id = *node_backend_id;
1699
- }
1700
- } else if (cur_backend_id != -1) {
1701
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1702
- }
1703
- }
1704
- }
1705
- // expand rest down
1706
- {
1707
- int cur_backend_id = -1;
1708
- for (int i = 0; i < graph->n_nodes; i++) {
1709
- struct lm_ggml_tensor * node = graph->nodes[i];
1710
- if (lm_ggml_is_view_op(node->op)) {
1711
- continue;
1712
- }
1713
- int * node_backend_id = &tensor_backend_id(node);
1714
- if (*node_backend_id != -1) {
1715
- cur_backend_id = *node_backend_id;
1716
- } else if (cur_backend_id != -1) {
1717
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1718
- }
1719
- }
1720
- }
1721
- // expand rest up
1722
- {
1723
- int cur_backend_id = -1;
1724
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1725
- struct lm_ggml_tensor * node = graph->nodes[i];
1726
- if (lm_ggml_is_view_op(node->op)) {
1727
- continue;
1728
- }
1729
- int * node_backend_id = &tensor_backend_id(node);
1730
- if (*node_backend_id != -1) {
1731
- cur_backend_id = *node_backend_id;
1732
- } else if (cur_backend_id != -1) {
1733
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1734
- }
1768
+ if (!sched->is_alloc) {
1769
+ if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
1770
+ return LM_GGML_STATUS_ALLOC_FAILED;
1735
1771
  }
1736
1772
  }
1737
1773
 
1738
- // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1739
- // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1740
- // however, we also need to verify that the sources are in compatible buffer types
1741
- // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1742
- // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1743
- // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1744
- // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1745
- // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1746
- for (int i = 0; i < graph->n_nodes; i++) {
1747
- struct lm_ggml_tensor * node = graph->nodes[i];
1748
- if (lm_ggml_is_view_op(node->op)) {
1749
- continue;
1750
- }
1751
- int * node_backend_id = &tensor_backend_id(node);
1752
- if (*node_backend_id == -1) {
1753
- // unassigned node: find the backend with the most supported inputs
1754
- int n_supported_best = -1;
1755
- for (int b = 0; b < sched->n_backends; b++) {
1756
- if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1757
- int n_supported = 0;
1758
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1759
- struct lm_ggml_tensor * src = node->src[j];
1760
- if (src == NULL) {
1761
- continue;
1762
- }
1763
- if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1764
- n_supported++;
1765
- }
1766
- }
1767
- if (n_supported > n_supported_best) {
1768
- n_supported_best = n_supported;
1769
- *node_backend_id = b;
1770
- SET_CAUSE(node, "3.best");
1771
- }
1772
- }
1773
- }
1774
- } else {
1775
- // assigned node: upgrade to higher prio backend if possible
1776
- for (int b = 0; b < *node_backend_id; b++) {
1777
- if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1778
- bool supported = true;
1779
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1780
- struct lm_ggml_tensor * src = node->src[j];
1781
- if (src == NULL) {
1782
- continue;
1783
- }
1784
- if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1785
- supported = false;
1786
- break;
1787
- }
1788
- }
1789
- if (supported) {
1790
- *node_backend_id = b;
1791
- SET_CAUSE(node, "3.upg");
1792
- break;
1793
- }
1794
- }
1795
- }
1796
- }
1774
+ return lm_ggml_backend_sched_compute_splits(sched);
1775
+ }
1776
+
1777
+ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
1778
+ for (int i = 0; i < sched->n_backends; i++) {
1779
+ lm_ggml_backend_synchronize(sched->backends[i]);
1797
1780
  }
1781
+ }
1782
+
1783
+ void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
1784
+ sched->callback_eval = callback;
1785
+ sched->callback_eval_user_data = user_data;
1786
+ }
1787
+
1788
+ int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
1789
+ return sched->n_splits;
1790
+ }
1791
+
1792
+ int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
1793
+ return sched->n_copies;
1794
+ }
1795
+
1796
+ int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
1797
+ return sched->n_backends;
1798
+ }
1799
+
1800
+ lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
1801
+ LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
1802
+ return sched->backends[i];
1803
+ }
1804
+
1805
+ size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1806
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1807
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1808
+
1809
+ return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1810
+ }
1811
+
1812
+ void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
1813
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1814
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1815
+ tensor_backend_id(node) = backend_index;
1816
+ SET_CAUSE(node, "usr");
1817
+ sched->is_reset = false;
1818
+ }
1798
1819
 
1799
- // pass 4: assign backends to remaining src from dst and view_src
1800
- for (int i = 0; i < graph->n_nodes; i++) {
1801
- struct lm_ggml_tensor * node = graph->nodes[i];
1802
- int * cur_backend_id = &tensor_backend_id(node);
1803
- if (node->view_src != NULL && *cur_backend_id == -1) {
1804
- *cur_backend_id = tensor_backend_id(node->view_src);
1805
- SET_CAUSE(node, "4.vsrc");
1806
- }
1807
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1808
- struct lm_ggml_tensor * src = node->src[j];
1809
- if (src == NULL) {
1810
- continue;
1811
- }
1812
- int * src_backend_id = &tensor_backend_id(src);
1813
- if (*src_backend_id == -1) {
1814
- if (src->view_src != NULL) {
1815
- // views are always on the same backend as the source
1816
- *src_backend_id = tensor_backend_id(src->view_src);
1817
- SET_CAUSE(src, "4.vsrc");
1818
- } else {
1819
- *src_backend_id = *cur_backend_id;
1820
- SET_CAUSE(src, "4.cur");
1821
- }
1822
- }
1823
- }
1820
+ lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
1821
+ int backend_index = tensor_backend_id(node);
1822
+ if (backend_index == -1) {
1823
+ return NULL;
1824
1824
  }
1825
+ return sched->backends[backend_index];
1826
+ }
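The two accessors above are the public way to pin a node to a backend before scheduling and to query its final placement afterwards; a small hedged sketch (graph and node are provided by the caller):

#include <cstdio>

// Pin one node to the highest-priority backend, compute, then report where
// the scheduler actually placed it.
static void pin_and_report(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph,
                           struct lm_ggml_tensor * node) {
    lm_ggml_backend_sched_set_tensor_backend(sched, node, lm_ggml_backend_sched_get_backend(sched, 0));
    lm_ggml_backend_sched_graph_compute(sched, graph);
    lm_ggml_backend_t used = lm_ggml_backend_sched_get_tensor_backend(sched, node);
    fprintf(stderr, "%s runs on %s\n", node->name, used ? lm_ggml_backend_name(used) : "NULL");
}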
1825
1827
 
1826
- // pass 5: split graph, find tensors that need to be copied
1827
- {
1828
- int i_split = 0;
1829
- struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1830
- // find the backend of the first split, skipping view ops
1831
- int i = 0;
1832
- for (; i < graph->n_nodes; i++) {
1833
- struct lm_ggml_tensor * node = graph->nodes[i];
1834
- if (!lm_ggml_is_view_op(node->op)) {
1835
- split->backend_id = tensor_backend_id(node);
1836
- break;
1837
- }
1838
- }
1839
- split->i_start = 0;
1840
- split->n_inputs = 0;
1841
- int cur_backend_id = split->backend_id;
1842
- for (; i < graph->n_nodes; i++) {
1843
- struct lm_ggml_tensor * node = graph->nodes[i];
1828
+ // utils
1844
1829
 
1845
- if (lm_ggml_is_view_op(node->op)) {
1846
- continue;
1847
- }
1830
+ void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
1831
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1832
+ LM_GGML_ASSERT(tensor->view_src != NULL);
1833
+ LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
1834
+ LM_GGML_ASSERT(tensor->view_src->data != NULL);
1848
1835
 
1849
- const int node_backend_id = tensor_backend_id(node);
1836
+ tensor->buffer = tensor->view_src->buffer;
1837
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1838
+ lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1839
+ }
1850
1840
 
1851
- assert(node_backend_id != -1); // all nodes should be assigned by now
1841
+ void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
1842
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1843
+ LM_GGML_ASSERT(tensor->data == NULL);
1844
+ LM_GGML_ASSERT(tensor->view_src == NULL);
1845
+ LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
1846
+ LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
1847
+ (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
1852
1848
 
1853
- // check if we should start a new split based on the sources of the current node
1854
- bool need_new_split = false;
1855
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1856
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1857
- struct lm_ggml_tensor * src = node->src[j];
1858
- if (src == NULL) {
1859
- continue;
1860
- }
1861
- // check if a weight is on a different backend
1862
- // by starting a new split, the memory of the previously offloaded weights can be reused
1863
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1864
- int src_backend_id = tensor_backend_id(src);
1865
- if (src_backend_id != cur_backend_id) {
1866
- need_new_split = true;
1867
- break;
1868
- }
1869
- }
1870
- // check if the split has too many inputs
1871
- // FIXME: count the number of inputs instead of only checking when full
1872
- if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1873
- const size_t id = hash_id(src);
1874
- int src_backend_id = sched->hv_tensor_backend_ids[id];
1875
- bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1876
- if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1877
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1878
- need_new_split = true;
1879
- break;
1880
- }
1881
- }
1882
- }
1883
- }
1849
+ tensor->buffer = buffer;
1850
+ tensor->data = addr;
1851
+ lm_ggml_backend_buffer_init_tensor(buffer, tensor);
1852
+ }
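A hedged sketch of a caller of the helper above; lm_ggml_new_tensor_1d and LM_GGML_TYPE_F32 are the standard prefixed ggml constructors and are not part of this hunk, the context must have been created with no_alloc, and the buffer must be large enough, otherwise the asserts above fire:

// Manually place a fresh tensor at the base of an existing backend buffer
// (this is the bookkeeping that ggml-alloc normally performs).
static void place_at_base(struct lm_ggml_context * ctx, lm_ggml_backend_buffer_t buf) {
    struct lm_ggml_tensor * t = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 1024);
    lm_ggml_backend_tensor_alloc(buf, t, lm_ggml_backend_buffer_get_base(buf));
}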
1884
1853
 
1885
- if (node_backend_id != cur_backend_id || need_new_split) {
1886
- split->i_end = i;
1887
- i_split++;
1888
- if (i_split >= sched->splits_capacity) {
1889
- sched->splits_capacity *= 2;
1890
- sched->splits = (lm_ggml_backend_sched_split *)
1891
- realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1892
- LM_GGML_ASSERT(sched->splits != NULL);
1893
- }
1894
- split = &sched->splits[i_split];
1895
- split->backend_id = node_backend_id;
1896
- split->i_start = i;
1897
- split->n_inputs = 0;
1898
- cur_backend_id = node_backend_id;
1899
- }
1854
+ static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
1855
+ struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
1900
1856
 
1901
- // find inputs that are not on the same backend
1902
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1903
- struct lm_ggml_tensor * src = node->src[j];
1904
- if (src == NULL) {
1905
- continue;
1906
- }
1857
+ LM_GGML_ASSERT(src != NULL);
1858
+ LM_GGML_ASSERT(src->data && "graph must be allocated");
1907
1859
 
1908
- size_t src_id = hash_id(src);
1909
- const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1910
- assert(src_backend_id != -1); // all inputs should be assigned by now
1860
+ size_t id = lm_ggml_hash_insert(&hash_set, src);
1861
+ if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
1862
+ return node_copies[lm_ggml_hash_find(&hash_set, src)];
1863
+ }
1911
1864
 
1912
- if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1913
- if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1914
- lm_ggml_backend_t backend = sched->backends[src_backend_id];
1915
- for (int c = 0; c < sched->n_copies; c++) {
1916
- struct lm_ggml_tensor * tensor_copy;
1917
- if (c == sched->cur_copy) {
1918
- tensor_copy = src; // use the original tensor as the current copy
1919
- } else {
1920
- tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1921
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1922
- }
1923
- if (sched->n_copies > 1) {
1924
- lm_ggml_set_input(tensor_copy);
1925
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1926
- }
1927
- tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1928
- SET_CAUSE(tensor_copy, "4.cpy");
1929
- }
1930
- int n_graph_inputs = sched->n_graph_inputs++;
1931
- LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1932
- sched->graph_inputs[n_graph_inputs] = src;
1933
- }
1934
- }
1865
+ struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1866
+ if (src->view_src != NULL) {
1867
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1868
+ dst->view_offs = src->view_offs;
1869
+ }
1870
+ dst->op = src->op;
1871
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
1872
+ lm_ggml_set_name(dst, src->name);
1935
1873
 
1936
- if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1937
- // create a copy of the input in the split's backend
1938
- if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1939
- lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1940
- for (int c = 0; c < sched->n_copies; c++) {
1941
- struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1942
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1943
- if (sched->n_copies > 1) {
1944
- lm_ggml_set_input(tensor_copy);
1945
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1946
- }
1947
- tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1948
- SET_CAUSE(tensor_copy, "4.cpy");
1949
- }
1950
- int n_inputs = split->n_inputs++;
1951
- LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1952
- split->inputs[n_inputs] = src;
1953
- }
1954
- node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1955
- }
1956
- }
1874
+ // copy src
1875
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1876
+ struct lm_ggml_tensor * s = src->src[i];
1877
+ if (s == NULL) {
1878
+ continue;
1957
1879
  }
1958
- split->i_end = graph->n_nodes;
1959
- sched->n_splits = i_split + 1;
1880
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1881
+ }
1882
+
1883
+ node_copies[id] = dst;
1884
+ return dst;
1885
+ }
1886
+
1887
+ static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
1888
+ size_t id = lm_ggml_hash_find(hash_set, src);
1889
+ if (node_init[id]) {
1890
+ return;
1891
+ }
1892
+ node_init[id] = true;
1893
+
1894
+ struct lm_ggml_tensor * dst = node_copies[id];
1895
+ if (dst->view_src != NULL) {
1896
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1897
+ lm_ggml_backend_view_init(dst);
1898
+ }
1899
+ else {
1900
+ lm_ggml_backend_tensor_copy(src, dst);
1901
+ }
1902
+
1903
+ // init src
1904
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1905
+ struct lm_ggml_tensor * s = src->src[i];
1906
+ if (s == NULL) {
1907
+ continue;
1908
+ }
1909
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1910
+ }
1911
+ }
1912
+
1913
+ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
1914
+ struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
1915
+ struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
1916
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
1917
+
1918
+ struct lm_ggml_init_params params = {
1919
+ /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
1920
+ /* .mem_buffer = */ NULL,
1921
+ /* .no_alloc = */ true
1922
+ };
1923
+
1924
+ struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
1925
+ struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
1926
+
1927
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1928
+ LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
1929
+ lm_ggml_hash_set_free(&hash_set);
1930
+ free(node_copies);
1931
+ free(node_init);
1932
+ lm_ggml_free(ctx_allocated);
1933
+ lm_ggml_free(ctx_unallocated);
1934
+ return {
1935
+ /* .buffer = */ NULL,
1936
+ /* .ctx_allocated = */ NULL,
1937
+ /* .ctx_unallocated = */ NULL,
1938
+ /* .graph = */ NULL,
1939
+ };
1940
+ }
1941
+
1942
+ // dup nodes
1943
+ for (int i = 0; i < graph->n_nodes; i++) {
1944
+ struct lm_ggml_tensor * node = graph->nodes[i];
1945
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1960
1946
  }
1961
1947
 
1962
- if (sched->debug) {
1963
- lm_ggml_backend_sched_print_assignments(sched, graph);
1948
+ // allocate nodes
1949
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1950
+ if (buffer == NULL) {
1951
+ LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
1952
+ lm_ggml_hash_set_free(&hash_set);
1953
+ free(node_copies);
1954
+ free(node_init);
1955
+ lm_ggml_free(ctx_allocated);
1956
+ lm_ggml_free(ctx_unallocated);
1957
+ return {
1958
+ /* .buffer = */ NULL,
1959
+ /* .ctx_allocated = */ NULL,
1960
+ /* .ctx_unallocated = */ NULL,
1961
+ /* .graph = */ NULL,
1962
+ };
1964
1963
  }
1965
1964
 
1966
- // swap node_backend_ids and leaf _backend_ids with prevs
1967
- {
1968
- int * tmp = sched->node_backend_ids;
1969
- sched->node_backend_ids = sched->prev_node_backend_ids;
1970
- sched->prev_node_backend_ids = tmp;
1965
+ //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1971
1966
 
1972
- tmp = sched->leaf_backend_ids;
1973
- sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1974
- sched->prev_leaf_backend_ids = tmp;
1967
+ // copy data and init views
1968
+ for (int i = 0; i < graph->n_nodes; i++) {
1969
+ struct lm_ggml_tensor * node = graph->nodes[i];
1970
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
1975
1971
  }
1976
1972
 
1977
- int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1978
- if (sched->graph.size < graph_size) {
1979
- sched->graph.size = graph_size;
1980
- sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1981
- sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1982
- LM_GGML_ASSERT(sched->graph.nodes != NULL);
1983
- LM_GGML_ASSERT(sched->graph.leafs != NULL);
1973
+ // build graph copy
1974
+ struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
1975
+ for (int i = 0; i < graph->n_nodes; i++) {
1976
+ struct lm_ggml_tensor * node = graph->nodes[i];
1977
+ struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
1978
+ graph_copy->nodes[i] = node_copy;
1984
1979
  }
1985
- sched->graph.n_nodes = 0;
1986
- sched->graph.n_leafs = 0;
1980
+ graph_copy->n_nodes = graph->n_nodes;
1987
1981
 
1988
- struct lm_ggml_cgraph * graph_copy = &sched->graph;
1982
+ lm_ggml_hash_set_free(&hash_set);
1983
+ free(node_copies);
1984
+ free(node_init);
1989
1985
 
1990
- for (int i = 0; i < sched->n_splits; i++) {
1991
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1992
- split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1986
+ return {
1987
+ /* .buffer = */ buffer,
1988
+ /* .ctx_allocated = */ ctx_allocated,
1989
+ /* .ctx_unallocated = */ ctx_unallocated,
1990
+ /* .graph = */ graph_copy,
1991
+ };
1992
+ }
1993
1993
 
1994
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1995
- for (int j = 0; j < split->n_inputs; j++) {
1996
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1994
+ void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
1995
+ lm_ggml_backend_buffer_free(copy.buffer);
1996
+ lm_ggml_free(copy.ctx_allocated);
1997
+ lm_ggml_free(copy.ctx_unallocated);
1998
+ }
1997
1999
 
1998
- struct lm_ggml_tensor * input = split->inputs[j];
1999
- const size_t input_id = hash_id(input);
2000
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
2000
+ bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2001
+ struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2002
+ if (copy.buffer == NULL) {
2003
+ return false;
2004
+ }
2001
2005
 
2002
- // add a dependency to the input source so that it is not freed before the copy is done
2003
- struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
2004
- input_dep->src[0] = input;
2005
- sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
2006
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
2006
+ struct lm_ggml_cgraph * g1 = graph;
2007
+ struct lm_ggml_cgraph * g2 = copy.graph;
2007
2008
 
2008
- // add a dependency to the input copy so that it is allocated at the start of the split
2009
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
2010
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
2011
- }
2009
+ assert(g1->n_nodes == g2->n_nodes);
2012
2010
 
2013
- for (int j = split->i_start; j < split->i_end; j++) {
2014
- assert(graph_copy->size > graph_copy->n_nodes);
2015
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
2016
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
2017
- }
2018
- }
2011
+ for (int i = 0; i < g1->n_nodes; i++) {
2012
+ //printf("eval %d/%d\n", i, g1->n_nodes);
2013
+ struct lm_ggml_tensor * t1 = g1->nodes[i];
2014
+ struct lm_ggml_tensor * t2 = g2->nodes[i];
2019
2015
 
2020
- if (sched->n_copies > 1) {
2021
- // add input copies as leafs so that they are allocated first
2022
- for (int i = 0; i < sched->n_graph_inputs; i++) {
2023
- struct lm_ggml_tensor * input = sched->graph_inputs[i];
2024
- size_t id = hash_id(input);
2025
- int backend_id = tensor_backend_id(input);
2026
- for (int c = 0; c < sched->n_copies; c++) {
2027
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2028
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2029
- assert(graph_copy->size > graph_copy->n_leafs);
2030
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2031
- }
2016
+ assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2017
+
2018
+ struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2019
+ struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2020
+
2021
+ lm_ggml_backend_graph_compute(backend1, &g1v);
2022
+ lm_ggml_backend_graph_compute(backend2, &g2v);
2023
+
2024
+ if (lm_ggml_is_view_op(t1->op)) {
2025
+ continue;
2032
2026
  }
2033
2027
 
2034
- for (int i = 0; i < sched->n_splits; i++) {
2035
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
2036
- int backend_id = split->backend_id;
2037
- for (int j = 0; j < split->n_inputs; j++) {
2038
- struct lm_ggml_tensor * input = split->inputs[j];
2039
- size_t id = hash_id(input);
2040
- for (int c = 0; c < sched->n_copies; c++) {
2041
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2042
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2043
- assert(graph_copy->size > graph_copy->n_leafs);
2044
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2045
- }
2046
- }
2028
+ // compare results, calculate rms etc
2029
+ if (!callback(i, t1, t2, user_data)) {
2030
+ break;
2047
2031
  }
2048
2032
  }
2049
2033
 
2050
- // add leafs from the original graph
2051
- for (int i = 0; i < graph->n_leafs; i++) {
2052
- struct lm_ggml_tensor * leaf = graph->leafs[i];
2053
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
2054
- assert(graph_copy->size > graph_copy->n_leafs);
2055
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
2056
- }
2034
+ lm_ggml_backend_graph_copy_free(copy);
2035
+
2036
+ return true;
2057
2037
  }
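
A minimal usage sketch of the comparison helper added above (illustrative only, not part of the package diff): the callback receives the node index and the two per-backend result tensors and returns false to stop early; the backends and graph are assumed to have been created elsewhere.

    #include <stdio.h>

    static bool compare_cb(int node_index, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data) {
        // a real callback would read both tensors back and compute an error metric;
        // returning true continues with the next node, false stops the comparison
        fprintf(stderr, "node %d (%s): computed on both backends\n", node_index, t1->name);
        (void) t2; (void) user_data;
        return true;
    }

    // bool ok = lm_ggml_backend_compare_graph_backend(cpu_backend, other_backend, graph, compare_cb, /*user_data=*/NULL);
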
2058
2038
 
2059
- static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
2060
- bool backend_ids_changed = false;
2061
- for (int i = 0; i < sched->graph.n_nodes; i++) {
2062
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
2063
- sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
2064
- backend_ids_changed = true;
2065
- break;
2066
- }
2039
+
2040
+
2041
+ #include "ggml-backend.h"
2042
+ #include "ggml-backend-impl.h"
2043
+ #include "ggml-cpu.h"
2044
+ #include "ggml-impl.h"
2045
+ #include <cctype>
2046
+ #include <string>
2047
+
2048
+ // ggml-backend interface
2049
+
2050
+ // CPU backend - buffer
2051
+
2052
+ static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
2053
+ uintptr_t data = (uintptr_t)buffer->context;
2054
+
2055
+ // align the buffer
2056
+ if (data % TENSOR_ALIGNMENT != 0) {
2057
+ data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
2067
2058
  }
2068
- if (!backend_ids_changed) {
2069
- for (int i = 0; i < sched->graph.n_leafs; i++) {
2070
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
2071
- sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
2072
- backend_ids_changed = true;
2073
- break;
2074
- }
2075
- }
2059
+
2060
+ return (void *)data;
2061
+ }
2062
+
2063
+ static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2064
+ lm_ggml_aligned_free(buffer->context, buffer->size);
2065
+ }
2066
+
2067
+ static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
2068
+ memset((char *)tensor->data + offset, value, size);
2069
+
2070
+ LM_GGML_UNUSED(buffer);
2071
+ }
2072
+
2073
+ static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2074
+ memcpy((char *)tensor->data + offset, data, size);
2075
+
2076
+ LM_GGML_UNUSED(buffer);
2077
+ }
2078
+
2079
+ static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2080
+ memcpy(data, (const char *)tensor->data + offset, size);
2081
+
2082
+ LM_GGML_UNUSED(buffer);
2083
+ }
2084
+
2085
+ static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
2086
+ if (lm_ggml_backend_buffer_is_host(src->buffer)) {
2087
+ memcpy(dst->data, src->data, lm_ggml_nbytes(src));
2088
+ return true;
2076
2089
  }
2090
+ return false;
2077
2091
 
2078
- // allocate graph
2079
- if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2080
- // the re-allocation may cause the split inputs to be moved to a different address
2081
- lm_ggml_backend_sched_synchronize(sched);
2082
- #ifndef NDEBUG
2083
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
2084
- #endif
2085
- lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
2086
- if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2087
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
2088
- return false;
2089
- }
2092
+ LM_GGML_UNUSED(buffer);
2093
+ }
2094
+
2095
+ static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
2096
+ memset(buffer->context, value, buffer->size);
2097
+ }
2098
+
2099
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
2100
+ /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
2101
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2102
+ /* .init_tensor = */ NULL, // no initialization required
2103
+ /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2104
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2105
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2106
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2107
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2108
+ /* .reset = */ NULL,
2109
+ };
2110
+
2111
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
2112
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
2113
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2114
+ /* .init_tensor = */ NULL, // no initialization required
2115
+ /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2116
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2117
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2118
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2119
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2120
+ /* .reset = */ NULL,
2121
+ };
2122
+
2123
+ // CPU backend - buffer type
2124
+
2125
+ static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2126
+ return "CPU";
2127
+
2128
+ LM_GGML_UNUSED(buft);
2129
+ }
2130
+
2131
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2132
+ void * data = lm_ggml_aligned_malloc(size);
2133
+
2134
+ if (data == NULL) {
2135
+ LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
2136
+ return NULL;
2090
2137
  }
2091
2138
 
2092
- return true;
2139
+ return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
2093
2140
  }
2094
2141
 
2095
- static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
2096
- struct lm_ggml_backend_sched_split * splits = sched->splits;
2142
+ static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
2143
+ return TENSOR_ALIGNMENT;
2097
2144
 
2098
- for (int i = 0; i < sched->n_splits; i++) {
2099
- struct lm_ggml_backend_sched_split * split = &splits[i];
2100
- int split_backend_id = split->backend_id;
2101
- lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
2145
+ LM_GGML_UNUSED(buft);
2146
+ }
2102
2147
 
2103
- // copy the input tensors to the split backend
2104
- for (int j = 0; j < split->n_inputs; j++) {
2105
- lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
2106
- struct lm_ggml_tensor * input = split->inputs[j];
2107
- struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
2148
+ static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
2149
+ return true;
2108
2150
 
2109
- if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
2110
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
2111
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2112
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2113
- } else {
2114
- lm_ggml_backend_synchronize(split_backend);
2115
- }
2116
- lm_ggml_backend_tensor_copy(input, input_cpy);
2117
- } else {
2118
- // wait for the split backend to finish using the input before overwriting it
2119
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2120
- lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
2121
- } else {
2122
- lm_ggml_backend_synchronize(split_backend);
2123
- }
2124
- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
2125
- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
2126
- if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
2127
- lm_ggml_backend_synchronize(input_backend);
2128
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2129
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2130
- } else {
2131
- lm_ggml_backend_synchronize(split_backend);
2132
- }
2133
- lm_ggml_backend_tensor_copy(input, input_cpy);
2134
- }
2135
- }
2136
- }
2151
+ LM_GGML_UNUSED(buft);
2152
+ }
2137
2153
 
2138
- if (!sched->callback_eval) {
2139
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
2140
- if (ec != LM_GGML_STATUS_SUCCESS) {
2141
- return ec;
2142
- }
2143
- } else {
2144
- // similar to lm_ggml_backend_compare_graph_backend
2145
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
2146
- struct lm_ggml_tensor * t = split->graph.nodes[j0];
2154
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
2155
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2156
+ /* .iface = */ {
2157
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
2158
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2159
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2160
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2161
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2162
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2163
+ },
2164
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2165
+ /* .context = */ NULL,
2166
+ };
2147
2167
 
2148
- // check if the user needs data from this node
2149
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2168
+ return &lm_ggml_backend_cpu_buffer_type;
2169
+ }
2150
2170
 
2151
- int j1 = j0;
2171
+ static const char * lm_ggml_backend_cpu_buffer_from_ptr_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2172
+ return "CPU_Mapped";
2152
2173
 
2153
- // determine the range [j0, j1] of nodes that can be computed together
2154
- while (!need && j1 < split->graph.n_nodes - 1) {
2155
- t = split->graph.nodes[++j1];
2156
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2157
- }
2174
+ LM_GGML_UNUSED(buft);
2175
+ }
2158
2176
 
2159
- struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
2177
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(void) {
2178
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2179
+ /* .iface = */ {
2180
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_from_ptr_type_get_name,
2181
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2182
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2183
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2184
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2185
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2186
+ },
2187
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2188
+ /* .context = */ NULL,
2189
+ };
2160
2190
 
2161
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
2162
- if (ec != LM_GGML_STATUS_SUCCESS) {
2163
- return ec;
2164
- }
2191
+ return &lm_ggml_backend_cpu_buffer_type;
2192
+ }
2165
2193
 
2166
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
2167
- lm_ggml_backend_synchronize(split_backend);
2194
+ #ifdef LM_GGML_USE_CPU_HBM
2168
2195
 
2169
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
2170
- break;
2171
- }
2196
+ // buffer type HBM
2172
2197
 
2173
- j0 = j1;
2174
- }
2175
- }
2198
+ #include <hbwmalloc.h>
2176
2199
 
2177
- // record the event of this copy
2178
- if (split->n_inputs > 0) {
2179
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2180
- lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
2181
- }
2182
- }
2200
+ static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2201
+ return "CPU_HBM";
2202
+
2203
+ LM_GGML_UNUSED(buft);
2204
+ }
2205
+
2206
+ static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2207
+ hbw_free(buffer->context);
2208
+ }
2209
+
2210
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2211
+ void * ptr;
2212
+ int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
2213
+ if (result != 0) {
2214
+ LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
2215
+ return NULL;
2183
2216
  }
2184
2217
 
2185
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
2218
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2219
+ buffer->buft = buft;
2220
+ buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
2186
2221
 
2187
- return LM_GGML_STATUS_SUCCESS;
2222
+ return buffer;
2188
2223
  }
2189
2224
 
2190
- lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
2191
- lm_ggml_backend_t * backends,
2192
- lm_ggml_backend_buffer_type_t * bufts,
2193
- int n_backends,
2194
- size_t graph_size,
2195
- bool parallel) {
2196
- LM_GGML_ASSERT(n_backends > 0);
2197
- LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
2198
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
2225
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
2226
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
2227
+ /* .iface = */ {
2228
+ /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
2229
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
2230
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2231
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2232
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2233
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2234
+ },
2235
+ /* .context = */ NULL,
2236
+ };
2199
2237
 
2200
- struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
2238
+ return &lm_ggml_backend_cpu_buffer_type_hbm;
2239
+ }
2240
+ #endif
2201
2241
 
2202
- sched->debug = getenv("LM_GGML_SCHED_DEBUG") != NULL;
2203
- sched->n_backends = n_backends;
2204
- sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
2242
+ static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
2243
+ static lm_ggml_backend_buffer_type_t bufts[] = {
2244
+ #ifdef LM_GGML_USE_CPU_HBM
2245
+ lm_ggml_backend_cpu_hbm_buffer_type(),
2246
+ #endif
2247
+ NULL
2248
+ };
2205
2249
 
2206
- // initialize hash table
2207
- // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
2208
- sched->hash_set = lm_ggml_hash_set_new(graph_size);
2209
- sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2210
- sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
2250
+ return bufts;
2211
2251
 
2212
- const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
2213
- const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
2214
- sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
2215
- sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
2216
- sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
2217
- sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
2252
+ LM_GGML_UNUSED(device);
2253
+ }
2218
2254
 
2219
- sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
2220
- sched->context_buffer = (char *) malloc(sched->context_buffer_size);
2255
+ // CPU backend - backend (stream)
2221
2256
 
2222
- const int initial_splits_capacity = 16;
2223
- sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
2224
- sched->splits_capacity = initial_splits_capacity;
2257
+ struct lm_ggml_backend_cpu_context {
2258
+ int n_threads;
2259
+ lm_ggml_threadpool_t threadpool;
2225
2260
 
2226
- for (int b = 0; b < n_backends; b++) {
2227
- sched->backends[b] = backends[b];
2228
- sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
2229
- LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
2230
- if (sched->n_copies > 1) {
2231
- for (int c = 0; c < sched->n_copies; c++) {
2232
- sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
2233
- }
2234
- }
2235
- }
2261
+ uint8_t * work_data;
2262
+ size_t work_size;
2236
2263
 
2237
- sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
2264
+ lm_ggml_abort_callback abort_callback;
2265
+ void * abort_callback_data;
2266
+ };
2238
2267
 
2239
- lm_ggml_backend_sched_reset(sched);
2268
+ static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
2269
+ return "CPU";
2240
2270
 
2241
- return sched;
2271
+ LM_GGML_UNUSED(backend);
2242
2272
  }
2243
2273
 
2244
- void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
2245
- if (sched == NULL) {
2246
- return;
2247
- }
2248
- for (int b = 0; b < sched->n_backends; b++) {
2249
- for (int c = 0; c < sched->n_copies; c++) {
2250
- lm_ggml_backend_event_free(sched->events[b][c]);
2251
- }
2252
- }
2253
- lm_ggml_gallocr_free(sched->galloc);
2254
- lm_ggml_free(sched->ctx);
2255
- lm_ggml_hash_set_free(&sched->hash_set);
2256
- free(sched->splits);
2257
- free(sched->hv_tensor_backend_ids);
2258
- free(sched->hv_tensor_copies);
2259
- free(sched->node_backend_ids);
2260
- free(sched->leaf_backend_ids);
2261
- free(sched->prev_node_backend_ids);
2262
- free(sched->prev_leaf_backend_ids);
2263
- free(sched->context_buffer);
2264
- free(sched->graph.nodes);
2265
- free(sched->graph.leafs);
2266
- free(sched);
2274
+ static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
2275
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2276
+ delete[] cpu_ctx->work_data;
2277
+ delete cpu_ctx;
2278
+ delete backend;
2267
2279
  }
2268
2280
 
2269
- void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
2270
- // reset state for the next run
2271
- if (!sched->is_reset) {
2272
- lm_ggml_hash_set_reset(&sched->hash_set);
2273
- memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2274
- memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
2275
- sched->is_reset = true;
2276
- }
2277
- sched->is_alloc = false;
2278
- }
2281
+ struct lm_ggml_backend_plan_cpu {
2282
+ struct lm_ggml_cplan cplan;
2283
+ struct lm_ggml_cgraph cgraph;
2284
+ };
2279
2285
 
2280
- bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
2281
- LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
2286
+ static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
2287
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2282
2288
 
2283
- lm_ggml_backend_sched_split_graph(sched, measure_graph);
2289
+ struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
2284
2290
 
2285
- if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
2286
- return false;
2291
+ cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2292
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
2293
+
2294
+ if (cpu_plan->cplan.work_size > 0) {
2295
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
2296
+ if (cpu_plan->cplan.work_data == NULL) {
2297
+ delete cpu_plan;
2298
+ return NULL;
2299
+ }
2287
2300
  }
2288
2301
 
2289
- lm_ggml_backend_sched_reset(sched);
2290
- lm_ggml_backend_sched_synchronize(sched);
2302
+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
2303
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2291
2304
 
2292
- return true;
2305
+ return cpu_plan;
2293
2306
  }
2294
2307
 
2295
- bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2296
- LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
2308
+ static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2309
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2297
2310
 
2298
- lm_ggml_backend_sched_split_graph(sched, graph);
2311
+ delete[] cpu_plan->cplan.work_data;
2312
+ delete cpu_plan;
2299
2313
 
2314
+ LM_GGML_UNUSED(backend);
2315
+ }
2300
2316
 
2301
- if (!lm_ggml_backend_sched_alloc_splits(sched)) {
2302
- return false;
2303
- }
2317
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2318
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2304
2319
 
2305
- sched->is_alloc = true;
2320
+ return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
2306
2321
 
2307
- return true;
2322
+ LM_GGML_UNUSED(backend);
2308
2323
  }
2309
2324
 
2310
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2311
- enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
2312
- lm_ggml_backend_sched_synchronize(sched);
2313
- return err;
2314
- }
2325
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
2326
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2315
2327
 
2316
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2317
- if (!sched->is_reset && !sched->is_alloc) {
2318
- lm_ggml_backend_sched_reset(sched);
2319
- }
2328
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2320
2329
 
2321
- if (!sched->is_alloc) {
2322
- if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
2330
+ if (cpu_ctx->work_size < cplan.work_size) {
2331
+ delete[] cpu_ctx->work_data;
2332
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
2333
+ if (cpu_ctx->work_data == NULL) {
2334
+ cpu_ctx->work_size = 0;
2323
2335
  return LM_GGML_STATUS_ALLOC_FAILED;
2324
2336
  }
2337
+ cpu_ctx->work_size = cplan.work_size;
2325
2338
  }
2339
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;
2326
2340
 
2327
- return lm_ggml_backend_sched_compute_splits(sched);
2328
- }
2341
+ cplan.abort_callback = cpu_ctx->abort_callback;
2342
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2329
2343
 
2330
- void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
2331
- for (int i = 0; i < sched->n_backends; i++) {
2332
- lm_ggml_backend_synchronize(sched->backends[i]);
2333
- }
2344
+ return lm_ggml_graph_compute(cgraph, &cplan);
2334
2345
  }
2335
2346
 
2336
- void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
2337
- sched->callback_eval = callback;
2338
- sched->callback_eval_user_data = user_data;
2339
- }
2347
+ static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
2348
+ /* .get_name = */ lm_ggml_backend_cpu_get_name,
2349
+ /* .free = */ lm_ggml_backend_cpu_free,
2350
+ /* .set_tensor_async = */ NULL,
2351
+ /* .get_tensor_async = */ NULL,
2352
+ /* .cpy_tensor_async = */ NULL,
2353
+ /* .synchronize = */ NULL,
2354
+ /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
2355
+ /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
2356
+ /* .graph_plan_update = */ NULL,
2357
+ /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
2358
+ /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
2359
+ /* .event_record = */ NULL,
2360
+ /* .event_wait = */ NULL,
2361
+ };
2340
2362
 
2341
- int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
2342
- return sched->n_splits;
2363
+ static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
2364
+ static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
2365
+ return &guid;
2343
2366
  }
2344
2367
 
2345
- int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
2346
- return sched->n_copies;
2347
- }
2368
+ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
2369
+ // initialize CPU backend now to avoid slowing the first graph computation
2370
+ lm_ggml_cpu_init();
2348
2371
 
2349
- int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
2350
- return sched->n_backends;
2372
+ struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
2373
+ if (ctx == NULL) {
2374
+ return NULL;
2375
+ }
2376
+
2377
+ ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
2378
+ ctx->threadpool = NULL;
2379
+ ctx->work_data = NULL;
2380
+ ctx->work_size = 0;
2381
+ ctx->abort_callback = NULL;
2382
+ ctx->abort_callback_data = NULL;
2383
+
2384
+ lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
2385
+ /* .guid = */ lm_ggml_backend_cpu_guid(),
2386
+ /* .interface = */ lm_ggml_backend_cpu_i,
2387
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2388
+ /* .context = */ ctx,
2389
+ };
2390
+
2391
+ if (cpu_backend == NULL) {
2392
+ delete ctx;
2393
+ return NULL;
2394
+ }
2395
+
2396
+ return cpu_backend;
2351
2397
  }
2352
2398
 
2353
- lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
2354
- LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
2355
- return sched->backends[i];
2399
+ bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
2400
+ return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
2356
2401
  }
2357
2402
 
2358
- size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
2359
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2360
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2403
+ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
2404
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2361
2405
 
2362
- return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2406
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2407
+ ctx->n_threads = n_threads;
2363
2408
  }
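
A minimal end-to-end sketch of the CPU backend entry points above (illustrative only): `graph` stands in for a previously built lm_ggml_cgraph, and lm_ggml_backend_free is assumed to follow the package's usual lm_-prefixed naming.

    lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
    if (backend != NULL) {
        lm_ggml_backend_cpu_set_n_threads(backend, 4);   // any positive thread count
        enum lm_ggml_status st = lm_ggml_backend_graph_compute(backend, graph);
        if (st != LM_GGML_STATUS_SUCCESS) {
            // handle the failure (e.g. LM_GGML_STATUS_ALLOC_FAILED from the work buffer)
        }
        lm_ggml_backend_free(backend);
    }
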
2364
2409
 
2365
- void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
2366
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2367
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2368
- tensor_backend_id(node) = backend_index;
2369
- SET_CAUSE(node, "usr");
2370
- sched->is_reset = false;
2371
- }
2410
+ void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
2411
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2372
2412
 
2373
- lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
2374
- int backend_index = tensor_backend_id(node);
2375
- if (backend_index == -1) {
2376
- return NULL;
2413
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2414
+
2415
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
2416
+ // already had a different threadpool, pause/suspend it before switching
2417
+ lm_ggml_threadpool_pause(ctx->threadpool);
2377
2418
  }
2378
- return sched->backends[backend_index];
2419
+ ctx->threadpool = threadpool;
2379
2420
  }
2380
2421
 
2381
- // utils
2422
+ void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
2423
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2382
2424
 
2383
- void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
2384
- LM_GGML_ASSERT(tensor->buffer == NULL);
2385
- LM_GGML_ASSERT(tensor->view_src != NULL);
2386
- LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
2387
- LM_GGML_ASSERT(tensor->view_src->data != NULL);
2425
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2426
+ ctx->abort_callback = abort_callback;
2427
+ ctx->abort_callback_data = abort_callback_data;
2428
+ }
2388
2429
 
2389
- tensor->buffer = tensor->view_src->buffer;
2390
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2391
- lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2430
+ lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
2431
+ LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
2432
+ return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
2392
2433
  }
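
A sketch of wrapping caller-owned memory with the helper above (illustrative only): the pointer must satisfy the TENSOR_ALIGNMENT assert, which lm_ggml_aligned_malloc (the allocator used by the CPU buffer type above) provides; freeing the resulting buffer does not free the wrapped pointer, since the from_ptr interface has no free_buffer.

    size_t size = 16u * 1024 * 1024;
    void * mem  = lm_ggml_aligned_malloc(size);
    lm_ggml_backend_buffer_t buf = lm_ggml_backend_cpu_buffer_from_ptr(mem, size);
    // ... place tensors in `buf` and compute ...
    lm_ggml_backend_buffer_free(buf);   // releases the wrapper only
    lm_ggml_aligned_free(mem, size);    // the caller still owns the memory
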
2393
2434
 
2394
- void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
2395
- LM_GGML_ASSERT(tensor->buffer == NULL);
2396
- LM_GGML_ASSERT(tensor->data == NULL);
2397
- LM_GGML_ASSERT(tensor->view_src == NULL);
2398
- LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
2399
- LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2400
- (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
2435
+ // CPU backend - device
2401
2436
 
2402
- tensor->buffer = buffer;
2403
- tensor->data = addr;
2404
- lm_ggml_backend_buffer_init_tensor(buffer, tensor);
2437
+ struct lm_ggml_backend_cpu_device_context {
2438
+ std::string description = "CPU";
2439
+
2440
+ lm_ggml_backend_cpu_device_context() {
2441
+ #ifdef __APPLE__
2442
+ size_t len = 0;
2443
+ if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
2444
+ description.resize(len);
2445
+ sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
2446
+ }
2447
+ #elif defined(__linux__)
2448
+ FILE * f = fopen("/proc/cpuinfo", "r");
2449
+ if (f) {
2450
+ char buf[1024];
2451
+ while (fgets(buf, sizeof(buf), f)) {
2452
+ if (strncmp(buf, "model name", 10) == 0) {
2453
+ char * p = strchr(buf, ':');
2454
+ if (p) {
2455
+ p++;
2456
+ while (std::isspace(*p)) {
2457
+ p++;
2458
+ }
2459
+ while (std::isspace(p[strlen(p) - 1])) {
2460
+ p[strlen(p) - 1] = '\0';
2461
+ }
2462
+ description = p;
2463
+ break;
2464
+ }
2465
+ }
2466
+ }
2467
+ fclose(f);
2468
+ }
2469
+ #elif defined(_WIN32)
2470
+ HKEY hKey;
2471
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
2472
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
2473
+ 0,
2474
+ KEY_READ,
2475
+ &hKey) == ERROR_SUCCESS) {
2476
+ DWORD cpu_brand_size = 0;
2477
+ if (RegQueryValueExA(hKey,
2478
+ TEXT("ProcessorNameString"),
2479
+ NULL,
2480
+ NULL,
2481
+ NULL,
2482
+ &cpu_brand_size) == ERROR_SUCCESS) {
2483
+ description.resize(cpu_brand_size);
2484
+ if (RegQueryValueExA(hKey,
2485
+ TEXT("ProcessorNameString"),
2486
+ NULL,
2487
+ NULL,
2488
+ (LPBYTE)&description[0], // NOLINT
2489
+ &cpu_brand_size) == ERROR_SUCCESS) {
2490
+ if (description.find('\0') != std::string::npos) {
2491
+ description.resize(description.find('\0'));
2492
+ }
2493
+ }
2494
+ }
2495
+ RegCloseKey(hKey);
2496
+ }
2497
+ #endif
2498
+ }
2499
+ };
2500
+
2501
+ static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
2502
+ return "CPU";
2503
+
2504
+ LM_GGML_UNUSED(dev);
2405
2505
  }
2406
2506
 
2407
- static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
2408
- struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
2507
+ static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
2508
+ struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
2409
2509
 
2410
- LM_GGML_ASSERT(src != NULL);
2411
- LM_GGML_ASSERT(src->data && "graph must be allocated");
2510
+ return ctx->description.c_str();
2511
+ }
2412
2512
 
2413
- size_t id = lm_ggml_hash_insert(&hash_set, src);
2414
- if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
2415
- return node_copies[lm_ggml_hash_find(&hash_set, src)];
2416
- }
2513
+ static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
2514
+ // TODO
2515
+ *free = 0;
2516
+ *total = 0;
2417
2517
 
2418
- struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2419
- if (src->view_src != NULL) {
2420
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2421
- dst->view_offs = src->view_offs;
2422
- }
2423
- dst->op = src->op;
2424
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2425
- lm_ggml_set_name(dst, src->name);
2518
+ LM_GGML_UNUSED(dev);
2519
+ }
2426
2520
 
2427
- // copy src
2428
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2429
- struct lm_ggml_tensor * s = src->src[i];
2430
- if (s == NULL) {
2431
- continue;
2432
- }
2433
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2434
- }
2521
+ static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
2522
+ return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
2435
2523
 
2436
- node_copies[id] = dst;
2437
- return dst;
2524
+ LM_GGML_UNUSED(dev);
2438
2525
  }
2439
2526
 
2440
- static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
2441
- size_t id = lm_ggml_hash_find(hash_set, src);
2442
- if (node_init[id]) {
2443
- return;
2444
- }
2445
- node_init[id] = true;
2527
+ static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
2528
+ props->name = lm_ggml_backend_cpu_device_get_name(dev);
2529
+ props->description = lm_ggml_backend_cpu_device_get_description(dev);
2530
+ props->type = lm_ggml_backend_cpu_device_get_type(dev);
2531
+ lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
2532
+ props->caps = {
2533
+ /* .async = */ false,
2534
+ /* .host_buffer = */ false,
2535
+ /* .buffer_from_host_ptr = */ true,
2536
+ /* .events = */ false,
2537
+ };
2538
+ }
2446
2539
 
2447
- struct lm_ggml_tensor * dst = node_copies[id];
2448
- if (dst->view_src != NULL) {
2449
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2450
- lm_ggml_backend_view_init(dst);
2451
- }
2452
- else {
2453
- lm_ggml_backend_tensor_copy(src, dst);
2454
- }
2540
+ static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
2541
+ return lm_ggml_backend_cpu_init();
2455
2542
 
2456
- // init src
2457
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2458
- struct lm_ggml_tensor * s = src->src[i];
2459
- if (s == NULL) {
2460
- continue;
2461
- }
2462
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2463
- }
2543
+ LM_GGML_UNUSED(dev);
2544
+ LM_GGML_UNUSED(params);
2464
2545
  }
2465
2546
 
2466
- struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
2467
- struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
2468
- struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2469
- bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
2547
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
2548
+ return lm_ggml_backend_cpu_buffer_type();
2470
2549
 
2471
- struct lm_ggml_init_params params = {
2472
- /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
2473
- /* .mem_buffer = */ NULL,
2474
- /* .no_alloc = */ true
2475
- };
2550
+ LM_GGML_UNUSED(dev);
2551
+ }
2476
2552
 
2477
- struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
2478
- struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
2553
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
2554
+ return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2479
2555
 
2480
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2481
- fprintf(stderr, "failed to allocate context for graph copy\n");
2482
- lm_ggml_hash_set_free(&hash_set);
2483
- free(node_copies);
2484
- free(node_init);
2485
- lm_ggml_free(ctx_allocated);
2486
- lm_ggml_free(ctx_unallocated);
2487
- return {
2488
- /* .buffer = */ NULL,
2489
- /* .ctx_allocated = */ NULL,
2490
- /* .ctx_unallocated = */ NULL,
2491
- /* .graph = */ NULL,
2492
- };
2493
- }
2556
+ LM_GGML_UNUSED(dev);
2557
+ LM_GGML_UNUSED(max_tensor_size);
2558
+ }
2494
2559
 
2495
- // dup nodes
2496
- for (int i = 0; i < graph->n_nodes; i++) {
2497
- struct lm_ggml_tensor * node = graph->nodes[i];
2498
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2560
+ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
2561
+ switch (op->op) {
2562
+ case LM_GGML_OP_CPY:
2563
+ return
2564
+ op->type != LM_GGML_TYPE_IQ2_XXS &&
2565
+ op->type != LM_GGML_TYPE_IQ2_XS &&
2566
+ op->type != LM_GGML_TYPE_IQ1_S &&
2567
+ op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
2568
+ case LM_GGML_OP_MUL_MAT:
2569
+ return op->src[1]->type == LM_GGML_TYPE_F32;// FIXME || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
2570
+ case LM_GGML_OP_ROPE_BACK:
2571
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
2572
+ case LM_GGML_OP_IM2COL_BACK:
2573
+ return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
2574
+ case LM_GGML_OP_OUT_PROD:
2575
+ return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
2576
+ default:
2577
+ return true;
2499
2578
  }
2500
2579
 
2501
- // allocate nodes
2502
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2503
- if (buffer == NULL) {
2504
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
2505
- lm_ggml_hash_set_free(&hash_set);
2506
- free(node_copies);
2507
- free(node_init);
2508
- lm_ggml_free(ctx_allocated);
2509
- lm_ggml_free(ctx_unallocated);
2510
- return {
2511
- /* .buffer = */ NULL,
2512
- /* .ctx_allocated = */ NULL,
2513
- /* .ctx_unallocated = */ NULL,
2514
- /* .graph = */ NULL,
2515
- };
2516
- }
2580
+ LM_GGML_UNUSED(dev);
2581
+ }
2517
2582
 
2518
- //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2583
+ static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
2584
+ return lm_ggml_backend_buft_is_host(buft);
2519
2585
 
2520
- // copy data and init views
2521
- for (int i = 0; i < graph->n_nodes; i++) {
2522
- struct lm_ggml_tensor * node = graph->nodes[i];
2523
- graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2524
- }
2586
+ LM_GGML_UNUSED(dev);
2587
+ }
2525
2588
 
2526
- // build graph copy
2527
- struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
2528
- for (int i = 0; i < graph->n_nodes; i++) {
2529
- struct lm_ggml_tensor * node = graph->nodes[i];
2530
- struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
2531
- graph_copy->nodes[i] = node_copy;
2532
- }
2533
- graph_copy->n_nodes = graph->n_nodes;
2589
+ static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
2590
+ /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
2591
+ /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
2592
+ /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
2593
+ /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
2594
+ /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
2595
+ /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
2596
+ /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
2597
+ /* .get_host_buffer_type = */ NULL,
2598
+ /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
2599
+ /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
2600
+ /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
2601
+ /* .offload_op = */ NULL,
2602
+ /* .event_new = */ NULL,
2603
+ /* .event_free = */ NULL,
2604
+ /* .event_synchronize = */ NULL,
2605
+ };
2534
2606
 
2535
- lm_ggml_hash_set_free(&hash_set);
2536
- free(node_copies);
2537
- free(node_init);
2607
+ // CPU backend - backend (reg)
2538
2608
 
2539
- return {
2540
- /* .buffer = */ buffer,
2541
- /* .ctx_allocated = */ ctx_allocated,
2542
- /* .ctx_unallocated = */ ctx_unallocated,
2543
- /* .graph = */ graph_copy,
2544
- };
2545
- }
2609
+ static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
2610
+ return "CPU";
2546
2611
 
2547
- void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
2548
- lm_ggml_backend_buffer_free(copy.buffer);
2549
- lm_ggml_free(copy.ctx_allocated);
2550
- lm_ggml_free(copy.ctx_unallocated);
2612
+ LM_GGML_UNUSED(reg);
2551
2613
  }
2552
2614
 
2553
- bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2554
- struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2555
- if (copy.buffer == NULL) {
2556
- return false;
2557
- }
2615
+ static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
2616
+ return 1;
2558
2617
 
2559
- struct lm_ggml_cgraph * g1 = graph;
2560
- struct lm_ggml_cgraph * g2 = copy.graph;
2618
+ LM_GGML_UNUSED(reg);
2619
+ }
2561
2620
 
2562
- assert(g1->n_nodes == g2->n_nodes);
2621
+ static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
2622
+ LM_GGML_ASSERT(index == 0);
2563
2623
 
2564
- for (int i = 0; i < g1->n_nodes; i++) {
2565
- //printf("eval %d/%d\n", i, g1->n_nodes);
2566
- struct lm_ggml_tensor * t1 = g1->nodes[i];
2567
- struct lm_ggml_tensor * t2 = g2->nodes[i];
2624
+ static lm_ggml_backend_cpu_device_context ctx;
2625
+ static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
2626
+ /* .iface = */ lm_ggml_backend_cpu_device_i,
2627
+ /* .reg = */ reg,
2628
+ /* .context = */ &ctx,
2629
+ };
2568
2630
 
2569
- assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2631
+ return &lm_ggml_backend_cpu_device;
2632
+ }
2570
2633
 
2571
- struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2572
- struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2634
+ static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
2635
+ if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
2636
+ return (void *)lm_ggml_backend_cpu_set_n_threads;
2637
+ }
2638
+ if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
2639
+ return (void *)lm_ggml_backend_cpu_get_extra_bufts;
2640
+ }
2573
2641
 
2574
- lm_ggml_backend_graph_compute(backend1, &g1v);
2575
- lm_ggml_backend_graph_compute(backend2, &g2v);
2642
+ return NULL;
2576
2643
 
2577
- if (lm_ggml_is_view_op(t1->op)) {
2578
- continue;
2579
- }
2644
+ LM_GGML_UNUSED(reg);
2645
+ }
2580
2646
 
2581
- // compare results, calculate rms etc
2582
- if (!callback(i, t1, t2, user_data)) {
2583
- break;
2584
- }
2585
- }
2647
+ static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
2648
+ /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
2649
+ /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
2650
+ /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
2651
+ /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
2652
+ };
2586
2653
 
2587
- lm_ggml_backend_graph_copy_free(copy);
2654
+ lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
2655
+ static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
2656
+ /* .iface = */ lm_ggml_backend_cpu_reg_i,
2657
+ /* .context = */ NULL,
2658
+ };
2588
2659
 
2589
- return true;
2660
+ return &lm_ggml_backend_cpu_reg;
2590
2661
  }
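
Finally, a sketch of how an out-of-tree caller might reach the CPU entry points through the registration object above (illustrative only); production code would normally go through public registry wrappers (assumed to exist under the package's lm_ prefix) rather than the iface used here.

    typedef void (*set_n_threads_t)(lm_ggml_backend_t, int);

    lm_ggml_backend_reg_t reg = lm_ggml_backend_cpu_reg();
    set_n_threads_t set_n_threads =
        (set_n_threads_t) reg->iface.get_proc_address(reg, "lm_ggml_backend_set_n_threads");

    lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
    if (backend != NULL && set_n_threads != NULL) {
        set_n_threads(backend, 8);   // same effect as calling lm_ggml_backend_cpu_set_n_threads() directly
    }
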