cui-llama.rn 1.2.4 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@
  #include <windows.h>
  #endif

+ #include "ggml-backend.h"
  #include "ggml-backend-impl.h"
  #include "ggml-alloc.h"
  #include "ggml-impl.h"
@@ -34,6 +35,11 @@ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
  }

  lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+ if (size == 0) {
+ // return a dummy buffer for zero-sized allocations
+ return lm_ggml_backend_buffer_init(buft, {}, NULL, 0);
+ }
+
  return buft->iface.alloc_buffer(buft, size);
  }
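A minimal caller-side sketch of the new zero-size path (illustration only, not part of the diff; the CPU buffer type is used here just as an example). Together with the lm_ggml_backend_buffer_get_base and lm_ggml_backend_buffer_clear hunks below, zero-sized buffers behave as inert stubs:

    // illustration: a zero-byte request now yields a stub buffer instead of
    // calling into the backend's alloc_buffer
    lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();
    lm_ggml_backend_buffer_t      buf  = lm_ggml_backend_buft_alloc_buffer(buft, 0);
    LM_GGML_ASSERT(buf != NULL && lm_ggml_backend_buffer_get_size(buf) == 0);
    lm_ggml_backend_buffer_free(buf); // the stub carries no backend callbacks, so freeing it is expected to be a no-op on the backend side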
 
@@ -89,7 +95,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
  }

  const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name(buffer);
+ return lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(buffer));
  }

  void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
@@ -108,6 +114,11 @@ size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
  }

  void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
+ // get_base is optional if the buffer is zero-sized
+ if (buffer->size == 0) {
+ return NULL;
+ }
+
  void * base = buffer->iface.get_base(buffer);

  LM_GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +133,15 @@ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct
  }
  }

+ void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
+ // clear is optional if the buffer is zero-sized
+ if (buffer->size == 0) {
+ return;
+ }
+
+ buffer->iface.clear(buffer, value);
+ }
+
  size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
  return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
  }
@@ -134,10 +154,6 @@ size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, st
  return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor);
  }

- void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
- buffer->iface.clear(buffer, value);
- }
-
  bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) {
  return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer));
  }
@@ -198,7 +214,7 @@ void lm_ggml_backend_free(lm_ggml_backend_t backend) {
  }

  lm_ggml_backend_buffer_type_t lm_ggml_backend_get_default_buffer_type(lm_ggml_backend_t backend) {
- return backend->iface.get_default_buffer_type(backend);
+ return lm_ggml_backend_dev_buffer_type(backend->device);
  }

  lm_ggml_backend_buffer_t lm_ggml_backend_alloc_buffer(lm_ggml_backend_t backend, size_t size) {
@@ -238,43 +254,42 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
  void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+ if (size == 0) {
+ return;
+ }
+
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
  LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");

- if (!size) {
- return;
- }
-
  buf->iface.set_tensor(buf, tensor, data, offset, size);
  }

  void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+ if (size == 0) {
+ return;
+ }
+
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
  LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds");

- if (!size) {
- return;
- }
-
  buf->iface.get_tensor(buf, tensor, data, offset, size);
  }

  LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

- LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
- LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
-
- if (!size) {
+ if (size == 0) {
  return;
  }

- LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+ LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
+ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds");
+ LM_GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");

  buf->iface.memset_tensor(buf, tensor, value, offset, size);
  }
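Moving the size == 0 check ahead of the assertions changes observable behaviour: a zero-length read or write no longer requires the tensor to be allocated. A hedged sketch of what this permits (illustration only, not part of the diff; uses the ordinary lm_ggml context API):

    // illustration: zero-length set/get is now a no-op even before allocation
    struct lm_ggml_init_params params = {
        /* .mem_size   = */ lm_ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // tensor metadata only, no data buffer
    };
    struct lm_ggml_context * ctx = lm_ggml_init(params);
    struct lm_ggml_tensor  * t   = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 16);
    float dummy = 0.0f;
    lm_ggml_backend_tensor_set(t, &dummy, 0, 0); // returns before the buffer/data asserts
    lm_ggml_backend_tensor_get(t, &dummy, 0, 0); // same early return on the read path
    lm_ggml_free(ctx);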
@@ -316,32 +331,15 @@ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backen
  }

  bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
- // helper to ease transition to device interface
- if (backend->device) {
- return lm_ggml_backend_dev_supports_op(backend->device, op);
- }
-
- return backend->iface.supports_op(backend, op);
+ return lm_ggml_backend_dev_supports_op(backend->device, op);
  }

  bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
- // helper to ease transition to device interface
- if (backend->device) {
- return lm_ggml_backend_dev_supports_buft(backend->device, buft);
- }
- return backend->iface.supports_buft(backend, buft);
+ return lm_ggml_backend_dev_supports_buft(backend->device, buft);
  }

  bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
- // helper to ease transition to device interface
- if (backend->device) {
- return lm_ggml_backend_dev_offload_op(backend->device, op);
- }
-
- if (backend->iface.offload_op != NULL) {
- return backend->iface.offload_op(backend, op);
- }
- return false;
+ return lm_ggml_backend_dev_offload_op(backend->device, op);
  }

  lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
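With the transition helpers removed, every backend is expected to expose a device (backend->device must be non-NULL), and the support queries are answered by that device. A small sketch (illustration only, assuming the CPU backend as an example):

    // illustration: support queries now always route through the backend's device
    lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init();
    bool host_buft_ok = lm_ggml_backend_supports_buft(cpu, lm_ggml_backend_cpu_buffer_type());
    // equivalent to querying the device directly
    LM_GGML_ASSERT(host_buft_ok ==
        lm_ggml_backend_dev_supports_buft(lm_ggml_backend_get_device(cpu), lm_ggml_backend_cpu_buffer_type()));
    lm_ggml_backend_free(cpu);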
@@ -561,6 +559,16 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
  # include "ggml-amx.h"
  #endif

+ #ifdef LM_GGML_USE_CANN
+ #include "ggml-cann.h"
+ #endif
+
+ #ifdef LM_GGML_USE_KOMPUTE
+ #include "ggml-kompute.h"
+ #endif
+
+ #include "ggml-cpu.h"
+
  struct lm_ggml_backend_registry {
  std::vector<lm_ggml_backend_reg_t> backends;
  std::vector<lm_ggml_backend_dev_t> devices;
@@ -578,6 +586,9 @@ struct lm_ggml_backend_registry {
  #ifdef LM_GGML_USE_VULKAN
  register_backend(lm_ggml_backend_vk_reg());
  #endif
+ #ifdef LM_GGML_USE_CANN
+ register_backend(lm_ggml_backend_cann_reg());
+ #endif
  #ifdef LM_GGML_USE_BLAS
  register_backend(lm_ggml_backend_blas_reg());
  #endif
@@ -587,8 +598,9 @@ struct lm_ggml_backend_registry {
  #ifdef LM_GGML_USE_AMX
  register_backend(lm_ggml_backend_amx_reg());
  #endif
-
- // TODO: kompute, cann
+ #ifdef LM_GGML_USE_KOMPUTE
+ register_backend(lm_ggml_backend_kompute_reg());
+ #endif

  register_backend(lm_ggml_backend_cpu_reg());
  }
@@ -694,9 +706,9 @@ lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type typ
  }

  lm_ggml_backend_t lm_ggml_backend_init_best(void) {
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU);
  if (!dev) {
- dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
+ dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
  }
  if (!dev) {
  return NULL;
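The device-type constants lose their _FULL suffix here (LM_GGML_BACKEND_DEVICE_TYPE_GPU / _CPU). A sketch of the caller-side effect (illustration only):

    // illustration: pick the best available device, preferring a GPU, falling back to CPU
    lm_ggml_backend_t backend = lm_ggml_backend_init_best();
    if (backend == NULL) {
        // no device of type GPU or CPU was registered
    } else {
        lm_ggml_backend_free(backend);
    }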
@@ -704,1922 +716,1946 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
  return lm_ggml_backend_dev_init(dev, NULL);
  }

- // backend CPU
-
- static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
- return "CPU";
-
- LM_GGML_UNUSED(buffer);
- }
+ // multi-buffer buffer

- static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
- uintptr_t data = (uintptr_t)buffer->context;
+ struct lm_ggml_backend_multi_buffer_context {
+ lm_ggml_backend_buffer_t * buffers;
+ size_t n_buffers;
+ };

- // align the buffer
- if (data % TENSOR_ALIGNMENT != 0) {
- data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
+ static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ lm_ggml_backend_buffer_free(ctx->buffers[i]);
  }

- return (void *)data;
+ free(ctx->buffers);
+ free(ctx);
  }

- static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
- lm_ggml_aligned_free(buffer->context, buffer->size);
+ static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
+ }
  }

- static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
- memset((char *)tensor->data + offset, value, size);
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
+ /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
+ /* .get_base = */ NULL,
+ /* .init_tensor = */ NULL,
+ /* .memset_tensor = */ NULL,
+ /* .set_tensor = */ NULL,
+ /* .get_tensor = */ NULL,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ lm_ggml_backend_multi_buffer_clear,
+ /* .reset = */ NULL,
+ };

- LM_GGML_UNUSED(buffer);
- }
+ lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
+ ctx->n_buffers = n_buffers;
+ ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));

- static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- memcpy((char *)tensor->data + offset, data, size);
+ LM_GGML_ASSERT(ctx->buffers != NULL);

- LM_GGML_UNUSED(buffer);
- }
+ size_t total_size = 0;
+ for (size_t i = 0; i < n_buffers; i++) {
+ ctx->buffers[i] = buffers[i];
+ total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
+ }

- static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- memcpy(data, (const char *)tensor->data + offset, size);
+ return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
+ }

- LM_GGML_UNUSED(buffer);
+ bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
+ return buffer->iface.free_buffer == lm_ggml_backend_multi_buffer_free_buffer;
  }

- static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
- if (lm_ggml_backend_buffer_is_host(src->buffer)) {
- memcpy(dst->data, src->data, lm_ggml_nbytes(src));
- return true;
+ void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
  }
- return false;
+ }

- LM_GGML_UNUSED(buffer);
+ // creates a copy of the tensor with the same memory layout
+ static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
+ struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
+ for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
+ dup->nb[i] = tensor->nb[i];
+ }
+ return dup;
  }

- static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
- memset(buffer->context, value, buffer->size);
+ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
+ return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
  }
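The multi-buffer helpers moved into this file wrap several backend buffers behind a single handle. A hedged usage sketch (illustration only, not part of the diff; buf_a and buf_b stand for hypothetical, already-allocated buffers):

    // illustration: treat two existing buffers as a single unit
    lm_ggml_backend_buffer_t parts[2] = { buf_a, buf_b }; // hypothetical buffers
    lm_ggml_backend_buffer_t multi = lm_ggml_backend_multi_buffer_alloc_buffer(parts, 2);
    lm_ggml_backend_multi_buffer_set_usage(multi, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS); // forwarded to each part
    lm_ggml_backend_buffer_clear(multi, 0);  // lm_ggml_backend_multi_buffer_clear loops over the parts
    lm_ggml_backend_buffer_free(multi);      // frees the wrapped buffers as well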
 
762
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
763
- /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
764
- /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
765
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
766
- /* .init_tensor = */ NULL, // no initialization required
767
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
768
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
769
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
770
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
771
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
772
- /* .reset = */ NULL,
773
- };
796
+ // scheduler
774
797
 
775
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
776
- /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
777
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
778
- /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
779
- /* .init_tensor = */ NULL, // no initialization required
780
- /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
781
- /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
782
- /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
783
- /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
784
- /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
785
- /* .reset = */ NULL,
786
- };
798
+ #ifndef LM_GGML_SCHED_MAX_BACKENDS
799
+ #define LM_GGML_SCHED_MAX_BACKENDS 16
800
+ #endif
787
801
 
788
- static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
789
- return "CPU";
802
+ #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
803
+ #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
804
+ #endif
790
805
 
791
- LM_GGML_UNUSED(buft);
792
- }
806
+ #ifndef LM_GGML_SCHED_MAX_COPIES
807
+ #define LM_GGML_SCHED_MAX_COPIES 4
808
+ #endif
793
809
 
794
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
795
- auto alloc_size = size;
796
- if (alloc_size == 0) {
797
- alloc_size = 1;
798
- }
810
+ struct lm_ggml_backend_sched_split {
811
+ int backend_id;
812
+ int i_start;
813
+ int i_end;
814
+ struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
815
+ int n_inputs;
816
+ // graph view of this split
817
+ struct lm_ggml_cgraph graph;
818
+ };
799
819
 
800
- void * data = lm_ggml_aligned_malloc(alloc_size);
820
+ struct lm_ggml_backend_sched {
821
+ bool is_reset; // true if the scheduler has been reset since the last graph split
822
+ bool is_alloc;
801
823
 
802
- if (data == NULL) {
803
- LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
804
- return NULL;
805
- }
824
+ int n_backends;
806
825
 
807
- return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, alloc_size);
808
- }
826
+ lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
827
+ lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
828
+ lm_ggml_gallocr_t galloc;
809
829
 
810
- static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
811
- return TENSOR_ALIGNMENT;
830
+ // hash map of the nodes in the graph
831
+ struct lm_ggml_hash_set hash_set;
832
+ int * hv_tensor_backend_ids; // [hash_set.size]
833
+ struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
812
834
 
813
- LM_GGML_UNUSED(buft);
814
- }
835
+ int * node_backend_ids; // [graph_size]
836
+ int * leaf_backend_ids; // [graph_size]
815
837
 
816
- static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
817
- return true;
838
+ int * prev_node_backend_ids; // [graph_size]
839
+ int * prev_leaf_backend_ids; // [graph_size]
818
840
 
819
- LM_GGML_UNUSED(buft);
820
- }
841
+ // copy of the graph with modified inputs
842
+ struct lm_ggml_cgraph graph;
821
843
 
822
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
823
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
824
- /* .iface = */ {
825
- /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
826
- /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
827
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
828
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
829
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
830
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
831
- },
832
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
833
- /* .context = */ NULL,
834
- };
844
+ // graph splits
845
+ struct lm_ggml_backend_sched_split * splits;
846
+ int n_splits;
847
+ int splits_capacity;
835
848
 
836
- return &lm_ggml_backend_cpu_buffer_type;
837
- }
849
+ // pipeline parallelism support
850
+ int n_copies;
851
+ int cur_copy;
852
+ lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
853
+ struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
854
+ int n_graph_inputs;
838
855
 
839
- #ifdef LM_GGML_USE_CPU_HBM
856
+ struct lm_ggml_context * ctx;
840
857
 
841
- // buffer type HBM
858
+ lm_ggml_backend_sched_eval_callback callback_eval;
859
+ void * callback_eval_user_data;
842
860
 
843
- #include <hbwmalloc.h>
861
+ char * context_buffer;
862
+ size_t context_buffer_size;
844
863
 
845
- static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
846
- return "CPU_HBM";
864
+ int debug;
865
+ };
847
866
 
848
- LM_GGML_UNUSED(buft);
849
- }
867
+ #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
868
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
869
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
870
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
850
871
 
851
- static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
852
- return "CPU_HBM";
853
-
854
- LM_GGML_UNUSED(buf);
872
+ // returns the priority of the backend, lower id is higher priority
873
+ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
874
+ for (int i = 0; i < sched->n_backends; i++) {
875
+ if (sched->backends[i] == backend) {
876
+ return i;
877
+ }
878
+ }
879
+ return -1;
855
880
  }
856
881
 
857
- static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
858
- hbw_free(buffer->context);
859
- }
882
+ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
883
+ lm_ggml_backend_buffer_t buffer = tensor->buffer;
884
+ if (buffer == NULL) {
885
+ return -1;
886
+ }
860
887
 
861
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
862
- //void * ptr = hbw_malloc(size);
863
- void * ptr;
864
- int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
865
- if (result != 0) {
866
- LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
867
- return NULL;
888
+ // find highest prio backend that supports the buffer type and the op
889
+ for (int i = 0; i < sched->n_backends; i++) {
890
+ if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
891
+ lm_ggml_backend_supports_op(sched->backends[i], op)) {
892
+ return i;
893
+ }
868
894
  }
869
895
 
870
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
871
- buffer->buft = buft;
872
- buffer->iface.get_name = lm_ggml_backend_cpu_hbm_buffer_get_name;
873
- buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
896
+ #ifndef NDEBUG
897
+ LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
898
+ __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
899
+ #endif
874
900
 
875
- return buffer;
901
+ return -1;
876
902
  }
877
903
 
878
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
879
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
880
- /* .iface = */ {
881
- /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
882
- /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
883
- /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
884
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
885
- /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
886
- /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
887
- },
888
- /* .context = */ NULL,
889
- };
890
-
891
- return &lm_ggml_backend_cpu_buffer_type_hbm;
892
- }
904
+ #if 0
905
+ #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
906
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
907
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
908
+ #define GET_CAUSE(node) causes[hash_id(node)]
909
+ #else
910
+ #define SET_CAUSE(node, ...)
911
+ #define GET_CAUSE(node) ""
893
912
  #endif
894
913
 
895
- struct lm_ggml_backend_cpu_context {
896
- int n_threads;
897
- lm_ggml_threadpool_t threadpool;
898
-
899
- uint8_t * work_data;
900
- size_t work_size;
914
+ // returns the backend that should be used for the node based on the current locations
915
+ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
916
+ // TODO: use supports_op to check if the backend supports the op
901
917
 
902
- lm_ggml_abort_callback abort_callback;
903
- void * abort_callback_data;
904
- };
918
+ // assign pre-allocated nodes to their backend
919
+ int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
920
+ if (cur_backend_id != -1) {
921
+ SET_CAUSE(tensor, "1.dst");
922
+ return cur_backend_id;
923
+ }
905
924
 
906
- static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
907
- return "CPU";
925
+ // view_src
926
+ if (tensor->view_src != NULL) {
927
+ cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
928
+ if (cur_backend_id != -1) {
929
+ SET_CAUSE(tensor, "1.vsrc");
930
+ return cur_backend_id;
931
+ }
932
+ }
908
933
 
909
- LM_GGML_UNUSED(backend);
910
- }
934
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
935
+ // since the tensor is pre-allocated, it cannot be moved to another backend
936
+ LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
937
+ }
911
938
 
912
- static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
913
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
914
- delete[] cpu_ctx->work_data;
915
- delete cpu_ctx;
916
- delete backend;
917
- }
939
+ // graph input
940
+ if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
941
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
942
+ SET_CAUSE(tensor, "1.inp");
943
+ return cur_backend_id;
944
+ }
918
945
 
919
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
920
- return lm_ggml_backend_cpu_buffer_type();
946
+ // operations with weights are preferably run on the same backend as the weights
947
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
948
+ const struct lm_ggml_tensor * src = tensor->src[i];
949
+ if (src == NULL) {
950
+ continue;
951
+ }
952
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
953
+ // not an ideal solution
954
+ if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
955
+ int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
956
+ // check if a backend with higher prio wants to offload the op
957
+ if (src_backend_id == sched->n_backends - 1) {
958
+ for (int b = 0; b < src_backend_id; b++) {
959
+ if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
960
+ SET_CAUSE(tensor, "1.off");
961
+ return b;
962
+ }
963
+ }
964
+ }
965
+ SET_CAUSE(tensor, "1.wgt%d", i);
966
+ return src_backend_id;
967
+ }
968
+ }
921
969
 
922
- LM_GGML_UNUSED(backend);
970
+ return -1;
923
971
  }
924
972
 
925
- struct lm_ggml_backend_plan_cpu {
926
- struct lm_ggml_cplan cplan;
927
- struct lm_ggml_cgraph cgraph;
928
- };
929
-
930
- static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
931
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
932
-
933
- struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
934
-
935
- cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
936
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
973
+ static char * fmt_size(size_t size) {
974
+ static char buffer[128];
975
+ if (size >= 1024*1024) {
976
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
977
+ } else {
978
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
979
+ }
980
+ return buffer;
981
+ }
937
982
 
938
- if (cpu_plan->cplan.work_size > 0) {
939
- cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
940
- if (cpu_plan->cplan.work_data == NULL) {
941
- delete cpu_plan;
942
- return NULL;
983
+ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
984
+ int cur_split = 0;
985
+ for (int i = 0; i < graph->n_nodes; i++) {
986
+ if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
987
+ lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
988
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
989
+ sched->splits[cur_split].n_inputs);
990
+ for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
991
+ LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
992
+ fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
993
+ }
994
+ LM_GGML_LOG_DEBUG("\n");
995
+ cur_split++;
996
+ }
997
+ struct lm_ggml_tensor * node = graph->nodes[i];
998
+ if (lm_ggml_is_view_op(node->op)) {
999
+ continue;
1000
+ }
1001
+ if (sched->debug > 1) {
1002
+ lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1003
+ LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1004
+ fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1005
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1006
+ struct lm_ggml_tensor * src = node->src[j];
1007
+ if (src == NULL) {
1008
+ continue;
1009
+ }
1010
+ lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1011
+ LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1012
+ fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1013
+ }
1014
+ LM_GGML_LOG_DEBUG("\n");
943
1015
  }
944
1016
  }
945
-
946
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
947
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
948
-
949
- return cpu_plan;
950
1017
  }
951
1018
 
952
- static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
953
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
1019
+ static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1020
+ lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1021
+ lm_ggml_backend_buffer_type_t buft = NULL;
954
1022
 
955
- delete[] cpu_plan->cplan.work_data;
956
- delete cpu_plan;
1023
+ if (buf) {
1024
+ // the tensor is already allocated
1025
+ buft = buf->buft;
1026
+ } else {
1027
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
1028
+ int tensor_backend_id = tensor_backend_id(t);
1029
+ if (tensor_backend_id == -1 && t->view_src) {
1030
+ tensor_backend_id = tensor_backend_id(t->view_src);
1031
+ }
1032
+ if (tensor_backend_id != -1) {
1033
+ buft = sched->bufts[tensor_backend_id];
1034
+ }
1035
+ }
957
1036
 
958
- LM_GGML_UNUSED(backend);
1037
+ return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
959
1038
  }
960
1039
 
961
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
962
- struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
1040
+ static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1041
+ if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1042
+ *node_backend_id = cur_backend_id;
1043
+ SET_CAUSE(node, "2.sup");
1044
+ }
1045
+ }
963
1046
 
964
- return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
1047
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1048
+ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1049
+ // reset splits
1050
+ sched->n_splits = 0;
1051
+ sched->n_graph_inputs = 0;
1052
+ sched->is_reset = false;
965
1053
 
966
- LM_GGML_UNUSED(backend);
967
- }
1054
+ struct lm_ggml_init_params params = {
1055
+ /* .mem_size = */ sched->context_buffer_size,
1056
+ /* .mem_buffer = */ sched->context_buffer,
1057
+ /* .no_alloc = */ true
1058
+ };
968
1059
 
969
- static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
970
- struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
1060
+ lm_ggml_free(sched->ctx);
971
1061
 
972
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
1062
+ sched->ctx = lm_ggml_init(params);
1063
+ if (sched->ctx == NULL) {
1064
+ LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
1065
+ }
973
1066
 
974
- if (cpu_ctx->work_size < cplan.work_size) {
975
- delete[] cpu_ctx->work_data;
976
- cpu_ctx->work_data = new uint8_t[cplan.work_size];
977
- if (cpu_ctx->work_data == NULL) {
978
- cpu_ctx->work_size = 0;
979
- return LM_GGML_STATUS_ALLOC_FAILED;
1067
+ // pass 1: assign backends to ops with pre-allocated inputs
1068
+ for (int i = 0; i < graph->n_leafs; i++) {
1069
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1070
+ int * leaf_backend_id = &tensor_backend_id(leaf);
1071
+ // do not overwrite user assignments
1072
+ if (*leaf_backend_id == -1) {
1073
+ *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
980
1074
  }
981
- cpu_ctx->work_size = cplan.work_size;
982
1075
  }
983
- cplan.work_data = (uint8_t *)cpu_ctx->work_data;
984
-
985
- cplan.abort_callback = cpu_ctx->abort_callback;
986
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
987
1076
 
988
- return lm_ggml_graph_compute(cgraph, &cplan);
989
- }
990
-
991
- static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
992
- /* .get_name = */ lm_ggml_backend_cpu_get_name,
993
- /* .free = */ lm_ggml_backend_cpu_free,
994
- /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
995
- /* .set_tensor_async = */ NULL,
996
- /* .get_tensor_async = */ NULL,
997
- /* .cpy_tensor_async = */ NULL,
998
- /* .synchronize = */ NULL,
999
- /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
1000
- /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
1001
- /* .graph_plan_update = */ NULL,
1002
- /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
1003
- /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
1004
- /* .supports_op = */ NULL,
1005
- /* .supports_buft = */ NULL,
1006
- /* .offload_op = */ NULL,
1007
- /* .event_record = */ NULL,
1008
- /* .event_wait = */ NULL,
1009
- };
1077
+ for (int i = 0; i < graph->n_nodes; i++) {
1078
+ struct lm_ggml_tensor * node = graph->nodes[i];
1079
+ int * node_backend_id = &tensor_backend_id(node);
1080
+ // do not overwrite user assignments
1081
+ if (*node_backend_id == -1) {
1082
+ *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1010
1083
 
1011
- static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
1012
- static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
1013
- return &guid;
1014
- }
1084
+ #if 0
1085
+ // src
1086
+ if (node->op == LM_GGML_OP_NONE) {
1087
+ continue;
1088
+ }
1015
1089
 
1016
- lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
1017
- struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
1018
- if (ctx == NULL) {
1019
- return NULL;
1090
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1091
+ struct lm_ggml_tensor * src = node->src[j];
1092
+ if (src == NULL) {
1093
+ continue;
1094
+ }
1095
+ int * src_backend_id = &tensor_backend_id(src);
1096
+ if (*src_backend_id == -1) {
1097
+ *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1098
+ }
1099
+ }
1100
+ #endif
1101
+ }
1020
1102
  }
1021
1103
 
1022
- ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
1023
- ctx->threadpool = NULL;
1024
- ctx->work_data = NULL;
1025
- ctx->work_size = 0;
1026
- ctx->abort_callback = NULL;
1027
- ctx->abort_callback_data = NULL;
1028
-
1029
- lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
1030
- /* .guid = */ lm_ggml_backend_cpu_guid(),
1031
- /* .interface = */ lm_ggml_backend_cpu_i,
1032
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
1033
- /* .context = */ ctx,
1034
- };
1035
-
1036
- if (cpu_backend == NULL) {
1037
- delete ctx;
1038
- return NULL;
1104
+ // pass 2: expand current backend assignments
1105
+ // assign the same backend to adjacent nodes
1106
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1107
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1108
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1109
+ // expand gpu down
1110
+ {
1111
+ int cur_backend_id = -1;
1112
+ for (int i = 0; i < graph->n_nodes; i++) {
1113
+ struct lm_ggml_tensor * node = graph->nodes[i];
1114
+ if (lm_ggml_is_view_op(node->op)) {
1115
+ continue;
1116
+ }
1117
+ int * node_backend_id = &tensor_backend_id(node);
1118
+ if (*node_backend_id != -1) {
1119
+ if (*node_backend_id == sched->n_backends - 1) {
1120
+ // skip cpu (lowest prio backend)
1121
+ cur_backend_id = -1;
1122
+ } else {
1123
+ cur_backend_id = *node_backend_id;
1124
+ }
1125
+ } else if (cur_backend_id != -1) {
1126
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1127
+ }
1128
+ }
1039
1129
  }
1040
-
1041
- return cpu_backend;
1042
- }
1043
-
1044
- bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
1045
- return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
1046
- }
1047
-
1048
- void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
1049
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1050
-
1051
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1052
- ctx->n_threads = n_threads;
1053
- }
1054
-
1055
- void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
1056
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1057
-
1058
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1059
-
1060
- if (ctx->threadpool && ctx->threadpool != threadpool) {
1061
- // already had a different threadpool, pause/suspend it before switching
1062
- lm_ggml_threadpool_pause(ctx->threadpool);
1130
+ // expand gpu up
1131
+ {
1132
+ int cur_backend_id = -1;
1133
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1134
+ struct lm_ggml_tensor * node = graph->nodes[i];
1135
+ if (lm_ggml_is_view_op(node->op)) {
1136
+ continue;
1137
+ }
1138
+ int * node_backend_id = &tensor_backend_id(node);
1139
+ if (*node_backend_id != -1) {
1140
+ if (*node_backend_id == sched->n_backends - 1) {
1141
+ // skip cpu (lowest prio backend)
1142
+ cur_backend_id = -1;
1143
+ } else {
1144
+ cur_backend_id = *node_backend_id;
1145
+ }
1146
+ } else if (cur_backend_id != -1) {
1147
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1148
+ }
1149
+ }
1150
+ }
1151
+ // expand rest down
1152
+ {
1153
+ int cur_backend_id = -1;
1154
+ for (int i = 0; i < graph->n_nodes; i++) {
1155
+ struct lm_ggml_tensor * node = graph->nodes[i];
1156
+ if (lm_ggml_is_view_op(node->op)) {
1157
+ continue;
1158
+ }
1159
+ int * node_backend_id = &tensor_backend_id(node);
1160
+ if (*node_backend_id != -1) {
1161
+ cur_backend_id = *node_backend_id;
1162
+ } else if (cur_backend_id != -1) {
1163
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1164
+ }
1165
+ }
1166
+ }
1167
+ // expand rest up
1168
+ {
1169
+ int cur_backend_id = -1;
1170
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1171
+ struct lm_ggml_tensor * node = graph->nodes[i];
1172
+ if (lm_ggml_is_view_op(node->op)) {
1173
+ continue;
1174
+ }
1175
+ int * node_backend_id = &tensor_backend_id(node);
1176
+ if (*node_backend_id != -1) {
1177
+ cur_backend_id = *node_backend_id;
1178
+ } else if (cur_backend_id != -1) {
1179
+ lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1180
+ }
1181
+ }
1063
1182
  }
1064
- ctx->threadpool = threadpool;
1065
- }
1066
-
1067
- void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
1068
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
1069
-
1070
- struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
1071
- ctx->abort_callback = abort_callback;
1072
- ctx->abort_callback_data = abort_callback_data;
1073
- }
1074
-
1075
- lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1076
- LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1077
- return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1078
- }
1079
-
1080
- ////////////////////////
1081
-
1082
- struct lm_ggml_backend_cpu_device_context {
1083
- std::string description = "CPU";
1084
1183
 
1085
- lm_ggml_backend_cpu_device_context() {
1086
- #ifdef __APPLE__
1087
- size_t len = 0;
1088
- if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1089
- description.resize(len);
1090
- sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1184
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1185
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1186
+ // however, we also need to verify that the sources are in compatible buffer types
1187
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1188
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1189
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1190
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1191
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1192
+ for (int i = 0; i < graph->n_nodes; i++) {
1193
+ struct lm_ggml_tensor * node = graph->nodes[i];
1194
+ if (lm_ggml_is_view_op(node->op)) {
1195
+ continue;
1091
1196
  }
1092
- #elif defined(__linux__)
1093
- FILE * f = fopen("/proc/cpuinfo", "r");
1094
- if (f) {
1095
- char buf[1024];
1096
- while (fgets(buf, sizeof(buf), f)) {
1097
- if (strncmp(buf, "model name", 10) == 0) {
1098
- char * p = strchr(buf, ':');
1099
- if (p) {
1100
- p++;
1101
- while (std::isspace(*p)) {
1102
- p++;
1197
+ int * node_backend_id = &tensor_backend_id(node);
1198
+ if (*node_backend_id == -1) {
1199
+ // unassigned node: find the backend with the most supported inputs
1200
+ int n_supported_best = -1;
1201
+ for (int b = 0; b < sched->n_backends; b++) {
1202
+ if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1203
+ int n_supported = 0;
1204
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1205
+ struct lm_ggml_tensor * src = node->src[j];
1206
+ if (src == NULL) {
1207
+ continue;
1103
1208
  }
1104
- while (std::isspace(p[strlen(p) - 1])) {
1105
- p[strlen(p) - 1] = '\0';
1209
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1210
+ n_supported++;
1106
1211
  }
1107
- description = p;
1212
+ }
1213
+ if (n_supported > n_supported_best) {
1214
+ n_supported_best = n_supported;
1215
+ *node_backend_id = b;
1216
+ SET_CAUSE(node, "3.best");
1217
+ }
1218
+ }
1219
+ }
1220
+ } else {
1221
+ // assigned node: upgrade to higher prio backend if possible
1222
+ for (int b = 0; b < *node_backend_id; b++) {
1223
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1224
+ bool supported = true;
1225
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1226
+ struct lm_ggml_tensor * src = node->src[j];
1227
+ if (src == NULL) {
1228
+ continue;
1229
+ }
1230
+ if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1231
+ supported = false;
1232
+ break;
1233
+ }
1234
+ }
1235
+ if (supported) {
1236
+ *node_backend_id = b;
1237
+ SET_CAUSE(node, "3.upg");
1108
1238
  break;
1109
1239
  }
1110
1240
  }
1111
1241
  }
1112
- fclose(f);
1113
1242
  }
1114
- #elif defined(_WIN32)
1115
- HKEY hKey;
1116
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1117
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1118
- 0,
1119
- KEY_READ,
1120
- &hKey) == ERROR_SUCCESS) {
1121
- DWORD cpu_brand_size = 0;
1122
- if (RegQueryValueExA(hKey,
1123
- TEXT("ProcessorNameString"),
1124
- NULL,
1125
- NULL,
1126
- NULL,
1127
- &cpu_brand_size) == ERROR_SUCCESS) {
1128
- description.resize(cpu_brand_size);
1129
- if (RegQueryValueExA(hKey,
1130
- TEXT("ProcessorNameString"),
1131
- NULL,
1132
- NULL,
1133
- (LPBYTE)&description[0], // NOLINT
1134
- &cpu_brand_size) == ERROR_SUCCESS) {
1135
- if (description.find('\0') != std::string::npos) {
1136
- description.resize(description.find('\0'));
1137
- }
1243
+ }
1244
+
1245
+ // pass 4: assign backends to remaining src from dst and view_src
1246
+ for (int i = 0; i < graph->n_nodes; i++) {
1247
+ struct lm_ggml_tensor * node = graph->nodes[i];
1248
+ int * cur_backend_id = &tensor_backend_id(node);
1249
+ if (node->view_src != NULL && *cur_backend_id == -1) {
1250
+ *cur_backend_id = tensor_backend_id(node->view_src);
1251
+ SET_CAUSE(node, "4.vsrc");
1252
+ }
1253
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1254
+ struct lm_ggml_tensor * src = node->src[j];
1255
+ if (src == NULL) {
1256
+ continue;
1257
+ }
1258
+ int * src_backend_id = &tensor_backend_id(src);
1259
+ if (*src_backend_id == -1) {
1260
+ if (src->view_src != NULL) {
1261
+ // views are always on the same backend as the source
1262
+ *src_backend_id = tensor_backend_id(src->view_src);
1263
+ SET_CAUSE(src, "4.vsrc");
1264
+ } else {
1265
+ *src_backend_id = *cur_backend_id;
1266
+ SET_CAUSE(src, "4.cur");
1138
1267
  }
1139
1268
  }
1140
- RegCloseKey(hKey);
1141
1269
  }
1142
- #endif
1143
1270
  }
1144
- };
1145
-
1146
- static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
1147
- return "CPU";
1148
-
1149
- LM_GGML_UNUSED(dev);
1150
- }
1151
1271
 
1152
- static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
1153
- struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
1272
+ // pass 5: split graph, find tensors that need to be copied
1273
+ {
1274
+ int i_split = 0;
1275
+ struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1276
+ // find the backend of the first split, skipping view ops
1277
+ int i = 0;
1278
+ for (; i < graph->n_nodes; i++) {
1279
+ struct lm_ggml_tensor * node = graph->nodes[i];
1280
+ if (!lm_ggml_is_view_op(node->op)) {
1281
+ split->backend_id = tensor_backend_id(node);
1282
+ break;
1283
+ }
1284
+ }
1285
+ split->i_start = 0;
1286
+ split->n_inputs = 0;
1287
+ int cur_backend_id = split->backend_id;
1288
+ for (; i < graph->n_nodes; i++) {
1289
+ struct lm_ggml_tensor * node = graph->nodes[i];
1154
1290
 
1155
- return ctx->description.c_str();
1156
- }
1291
+ if (lm_ggml_is_view_op(node->op)) {
1292
+ continue;
1293
+ }
1157
1294
 
1158
- static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
1159
- // TODO
1160
- *free = 0;
1161
- *total = 0;
1295
+ const int node_backend_id = tensor_backend_id(node);
1162
1296
 
1163
- LM_GGML_UNUSED(dev);
1164
- }
1297
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1165
1298
 
1166
- static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
1167
- return LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
1299
+ // check if we should start a new split based on the sources of the current node
1300
+ bool need_new_split = false;
1301
+ if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1302
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1303
+ struct lm_ggml_tensor * src = node->src[j];
1304
+ if (src == NULL) {
1305
+ continue;
1306
+ }
1307
+ // check if a weight is on a different and incompatible backend
1308
+ // by starting a new split, the memory of the previously offloaded weights can be reused
1309
+ if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1310
+ int src_backend_id = tensor_backend_id(src);
1311
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1312
+ need_new_split = true;
1313
+ break;
1314
+ }
1315
+ }
1316
+ // check if the split has too many inputs
1317
+ // FIXME: count the number of inputs instead of only checking when full
1318
+ if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1319
+ const size_t id = hash_id(src);
1320
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1321
+ bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1322
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1323
+ need_new_split = true;
1324
+ break;
1325
+ }
1326
+ }
1327
+ }
1328
+ }
1168
1329
 
1169
- LM_GGML_UNUSED(dev);
1170
- }
1330
+ if (node_backend_id != cur_backend_id || need_new_split) {
1331
+ split->i_end = i;
1332
+ i_split++;
1333
+ if (i_split >= sched->splits_capacity) {
1334
+ sched->splits_capacity *= 2;
1335
+ sched->splits = (lm_ggml_backend_sched_split *)
1336
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1337
+ LM_GGML_ASSERT(sched->splits != NULL);
1338
+ }
1339
+ split = &sched->splits[i_split];
1340
+ split->backend_id = node_backend_id;
1341
+ split->i_start = i;
1342
+ split->n_inputs = 0;
1343
+ cur_backend_id = node_backend_id;
1344
+ }
1171
1345
 
1172
- static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
1173
- props->name = lm_ggml_backend_cpu_device_get_name(dev);
1174
- props->description = lm_ggml_backend_cpu_device_get_description(dev);
1175
- props->type = lm_ggml_backend_cpu_device_get_type(dev);
1176
- lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1177
- props->caps = {
1178
- /* .async = */ false,
1179
- /* .host_buffer = */ false,
1180
- /* .buffer_from_host_ptr = */ true,
1181
- /* .events = */ false,
1182
- };
1183
- }
1346
+ // find inputs that are not on the same backend
1347
+ for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1348
+ struct lm_ggml_tensor * src = node->src[j];
1349
+ if (src == NULL) {
1350
+ continue;
1351
+ }
1184
1352
 
1185
- static lm_ggml_backend_t lm_ggml_backend_cpu_device_init(lm_ggml_backend_dev_t dev, const char * params) {
1186
- return lm_ggml_backend_cpu_init();
1353
+ size_t src_id = hash_id(src);
1354
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1355
+ assert(src_backend_id != -1); // all inputs should be assigned by now
1187
1356
 
1188
- LM_GGML_UNUSED(dev);
1189
- LM_GGML_UNUSED(params);
1190
- }
1357
+ if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1358
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1359
+ lm_ggml_backend_t backend = sched->backends[src_backend_id];
1360
+ for (int c = 0; c < sched->n_copies; c++) {
1361
+ struct lm_ggml_tensor * tensor_copy;
1362
+ if (c == sched->cur_copy) {
1363
+ tensor_copy = src; // use the original tensor as the current copy
1364
+ } else {
1365
+ tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1366
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1367
+ }
1368
+ if (sched->n_copies > 1) {
1369
+ lm_ggml_set_input(tensor_copy);
1370
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1371
+ }
1372
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1373
+ SET_CAUSE(tensor_copy, "4.cpy");
1374
+ }
1375
+ int n_graph_inputs = sched->n_graph_inputs++;
1376
+ LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1377
+ sched->graph_inputs[n_graph_inputs] = src;
1378
+ }
1379
+ }
1191
1380
 
1192
- static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
1193
- return lm_ggml_backend_cpu_buffer_type();
1381
+ if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1382
+ // create a copy of the input in the split's backend
1383
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1384
+ lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1385
+ for (int c = 0; c < sched->n_copies; c++) {
1386
+ struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1387
+ lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1388
+ if (sched->n_copies > 1) {
1389
+ lm_ggml_set_input(tensor_copy);
1390
+ lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1391
+ }
1392
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1393
+ SET_CAUSE(tensor_copy, "4.cpy");
1394
+ }
1395
+ int n_inputs = split->n_inputs++;
1396
+ LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1397
+ split->inputs[n_inputs] = src;
1398
+ }
1399
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1400
+ }
1401
+ }
1402
+ }
1403
+ split->i_end = graph->n_nodes;
1404
+ sched->n_splits = i_split + 1;
1405
+ }
1194
1406
 
1195
- LM_GGML_UNUSED(dev);
1196
- }
1407
+ if (sched->debug) {
1408
+ lm_ggml_backend_sched_print_assignments(sched, graph);
1409
+ }
1197
1410
 
1198
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1199
- return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
1411
+ // swap node_backend_ids and leaf _backend_ids with prevs
1412
+ {
1413
+ int * tmp = sched->node_backend_ids;
1414
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1415
+ sched->prev_node_backend_ids = tmp;
1200
1416
 
1201
- LM_GGML_UNUSED(dev);
1202
- LM_GGML_UNUSED(max_tensor_size);
1203
- }
1417
+ tmp = sched->leaf_backend_ids;
1418
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1419
+ sched->prev_leaf_backend_ids = tmp;
1420
+ }
1204
1421
 
1205
- static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
1206
- switch (op->op) {
1207
- case LM_GGML_OP_CPY:
1208
- return
1209
- op->type != LM_GGML_TYPE_IQ2_XXS &&
1210
- op->type != LM_GGML_TYPE_IQ2_XS &&
1211
- op->type != LM_GGML_TYPE_IQ1_S &&
1212
- op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
1213
- case LM_GGML_OP_MUL_MAT:
1214
- return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
1215
- case LM_GGML_OP_ROPE_BACK:
1216
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
1217
- case LM_GGML_OP_IM2COL_BACK:
1218
- return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
1219
- case LM_GGML_OP_OUT_PROD:
1220
- return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
1221
- default:
1222
- return true;
1422
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1423
+ if (sched->graph.size < graph_size) {
1424
+ sched->graph.size = graph_size;
1425
+ sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
1426
+ sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
1427
+ LM_GGML_ASSERT(sched->graph.nodes != NULL);
1428
+ LM_GGML_ASSERT(sched->graph.leafs != NULL);
1223
1429
  }
1430
+ sched->graph.n_nodes = 0;
1431
+ sched->graph.n_leafs = 0;
1224
1432
 
1225
- LM_GGML_UNUSED(dev);
1226
- }
1227
-
1228
- static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
1229
- return lm_ggml_backend_buft_is_host(buft);
1230
-
1231
- LM_GGML_UNUSED(dev);
1232
- }
1433
+ struct lm_ggml_cgraph * graph_copy = &sched->graph;
1233
1434
 
1234
- static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
1235
- /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
1236
- /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
1237
- /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
1238
- /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
1239
- /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
1240
- /* .init_backend = */ lm_ggml_backend_cpu_device_init,
1241
- /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
1242
- /* .get_host_buffer_type = */ NULL,
1243
- /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_ptr,
1244
- /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
1245
- /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
1246
- /* .offload_op = */ NULL,
1247
- /* .event_new = */ NULL,
1248
- /* .event_free = */ NULL,
1249
- /* .event_synchronize = */ NULL,
1250
- };
1435
+ for (int i = 0; i < sched->n_splits; i++) {
1436
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1437
+ split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1251
1438
 
1252
- ////////////////////////
1439
+ // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1440
+ for (int j = 0; j < split->n_inputs; j++) {
1441
+ assert(graph_copy->size > (graph_copy->n_nodes + 1));
1253
1442
 
1254
- static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
1255
- return "CPU";
1443
+ struct lm_ggml_tensor * input = split->inputs[j];
1444
+ const size_t input_id = hash_id(input);
1445
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1256
1446
 
1257
- LM_GGML_UNUSED(reg);
1258
- }
1447
+ // add a dependency to the input source so that it is not freed before the copy is done
1448
+ struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
1449
+ input_dep->src[0] = input;
1450
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1451
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1259
1452
 
1260
- static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
1261
- return 1;
1453
+ // add a dependency to the input copy so that it is allocated at the start of the split
1454
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1455
+ graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1456
+ }
1262
1457
 
1263
- LM_GGML_UNUSED(reg);
1264
- }
1458
+ for (int j = split->i_start; j < split->i_end; j++) {
1459
+ assert(graph_copy->size > graph_copy->n_nodes);
1460
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1461
+ graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1462
+ }
1463
+ }
1265
1464
 
1266
- static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
1267
- LM_GGML_ASSERT(index == 0);
1465
+ if (sched->n_copies > 1) {
1466
+ // add input copies as leafs so that they are allocated first
1467
+ for (int i = 0; i < sched->n_graph_inputs; i++) {
1468
+ struct lm_ggml_tensor * input = sched->graph_inputs[i];
1469
+ size_t id = hash_id(input);
1470
+ int backend_id = tensor_backend_id(input);
1471
+ for (int c = 0; c < sched->n_copies; c++) {
1472
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1473
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1474
+ assert(graph_copy->size > graph_copy->n_leafs);
1475
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1476
+ }
1477
+ }
1268
1478
 
1269
- static lm_ggml_backend_cpu_device_context ctx;
1270
- static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
1271
- /* .iface = */ lm_ggml_backend_cpu_device_i,
1272
- /* .reg = */ reg,
1273
- /* .context = */ &ctx,
1274
- };
1479
+ for (int i = 0; i < sched->n_splits; i++) {
1480
+ struct lm_ggml_backend_sched_split * split = &sched->splits[i];
1481
+ int backend_id = split->backend_id;
1482
+ for (int j = 0; j < split->n_inputs; j++) {
1483
+ struct lm_ggml_tensor * input = split->inputs[j];
1484
+ size_t id = hash_id(input);
1485
+ for (int c = 0; c < sched->n_copies; c++) {
1486
+ struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1487
+ sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1488
+ assert(graph_copy->size > graph_copy->n_leafs);
1489
+ graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1490
+ }
1491
+ }
1492
+ }
1493
+ }
1275
1494
 
1276
- return &lm_ggml_backend_cpu_device;
1495
+ // add leafs from the original graph
1496
+ for (int i = 0; i < graph->n_leafs; i++) {
1497
+ struct lm_ggml_tensor * leaf = graph->leafs[i];
1498
+ sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1499
+ assert(graph_copy->size > graph_copy->n_leafs);
1500
+ graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1501
+ }
1277
1502
  }
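The split pass above stores duplicated tensors in sched->hv_tensor_copies, a single flat allocation of hash_set.size * n_backends * n_copies slots indexed per (tensor hash id, backend, copy). A minimal standalone sketch of that indexing, mirroring the tensor_id_copy macro used throughout this file (the helper name is illustrative, not part of the API):

    // illustrative only: flat 3-D index into hv_tensor_copies,
    // laid out as [hash_set.size][n_backends][n_copies]
    static size_t tensor_copy_index(size_t id, int backend_id, int copy_id,
                                    int n_backends, int n_copies) {
        return id * n_backends * n_copies + (size_t) backend_id * n_copies + copy_id;
    }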
1278
1503
 
1279
- static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
1280
- if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
1281
- return (void *)lm_ggml_backend_cpu_set_n_threads;
1504
+ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
1505
+ bool backend_ids_changed = false;
1506
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
1507
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1508
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1509
+ backend_ids_changed = true;
1510
+ break;
1511
+ }
1512
+ }
1513
+ if (!backend_ids_changed) {
1514
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
1515
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1516
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1517
+ backend_ids_changed = true;
1518
+ break;
1519
+ }
1520
+ }
1282
1521
  }
1283
- return NULL;
1284
1522
 
1285
- LM_GGML_UNUSED(reg);
1523
+ // allocate graph
1524
+ if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1525
+ // the re-allocation may cause the split inputs to be moved to a different address
1526
+ lm_ggml_backend_sched_synchronize(sched);
1527
+ #ifndef NDEBUG
1528
+ LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1529
+ #endif
1530
+ lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1531
+ if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1532
+ LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
1533
+ return false;
1534
+ }
1535
+ }
1536
+
1537
+ return true;
1286
1538
  }
1287
1539
 
1288
- static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
1289
- /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
1290
- /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
1291
- /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
1292
- /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
1293
- };
1540
+ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
1541
+ struct lm_ggml_backend_sched_split * splits = sched->splits;
1294
1542
 
1295
- lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
1296
- static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
1297
- /* .iface = */ lm_ggml_backend_cpu_reg_i,
1298
- /* .context = */ NULL,
1299
- };
1543
+ for (int i = 0; i < sched->n_splits; i++) {
1544
+ struct lm_ggml_backend_sched_split * split = &splits[i];
1545
+ int split_backend_id = split->backend_id;
1546
+ lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
1300
1547
 
1301
- return &lm_ggml_backend_cpu_reg;
1302
- }
1548
+ // copy the input tensors to the split backend
1549
+ for (int j = 0; j < split->n_inputs; j++) {
1550
+ lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1551
+ struct lm_ggml_tensor * input = split->inputs[j];
1552
+ struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1303
1553
 
1304
- // multi-buffer buffer
1554
+ if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1555
+ // inputs from the user must be copied immediately to prevent the user from overwriting the data before the copy is done
1556
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1557
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1558
+ } else {
1559
+ lm_ggml_backend_synchronize(split_backend);
1560
+ }
1561
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1562
+ } else {
1563
+ // wait for the split backend to finish using the input before overwriting it
1564
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1565
+ lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1566
+ } else {
1567
+ lm_ggml_backend_synchronize(split_backend);
1568
+ }
1569
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1570
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1571
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1572
+ lm_ggml_backend_synchronize(input_backend);
1573
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1574
+ lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1575
+ } else {
1576
+ lm_ggml_backend_synchronize(split_backend);
1577
+ }
1578
+ lm_ggml_backend_tensor_copy(input, input_cpy);
1579
+ }
1580
+ }
1581
+ }
1305
1582
 
1306
- struct lm_ggml_backend_multi_buffer_context {
1307
- lm_ggml_backend_buffer_t * buffers;
1308
- size_t n_buffers;
1309
- };
1583
+ if (!sched->callback_eval) {
1584
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
1585
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1586
+ return ec;
1587
+ }
1588
+ } else {
1589
+ // similar to lm_ggml_backend_compare_graph_backend
1590
+ for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1591
+ struct lm_ggml_tensor * t = split->graph.nodes[j0];
1310
1592
 
1311
- static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
1312
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1593
+ // check if the user needs data from this node
1594
+ bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1313
1595
 
1314
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
1315
- }
1596
+ int j1 = j0;
1316
1597
 
1317
- static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
1318
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1319
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1320
- lm_ggml_backend_buffer_free(ctx->buffers[i]);
1321
- }
1598
+ // determine the range [j0, j1] of nodes that can be computed together
1599
+ while (!need && j1 < split->graph.n_nodes - 1) {
1600
+ t = split->graph.nodes[++j1];
1601
+ need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1602
+ }
1322
1603
 
1323
- free(ctx->buffers);
1324
- free(ctx);
1325
- }
1604
+ struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
1326
1605
 
1327
- static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
1328
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1329
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1330
- lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
1331
- }
1332
- }
1606
+ enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
1607
+ if (ec != LM_GGML_STATUS_SUCCESS) {
1608
+ return ec;
1609
+ }
1333
1610
 
1334
- static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
1335
- /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
1336
- /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
1337
- /* .get_base = */ NULL,
1338
- /* .init_tensor = */ NULL,
1339
- /* .memset_tensor = */ NULL,
1340
- /* .set_tensor = */ NULL,
1341
- /* .get_tensor = */ NULL,
1342
- /* .cpy_tensor = */ NULL,
1343
- /* .clear = */ lm_ggml_backend_multi_buffer_clear,
1344
- /* .reset = */ NULL,
1345
- };
1611
+ // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1612
+ lm_ggml_backend_synchronize(split_backend);
1346
1613
 
1347
- lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
1348
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
1349
- ctx->n_buffers = n_buffers;
1350
- ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
1614
+ if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1615
+ break;
1616
+ }
1351
1617
 
1352
- LM_GGML_ASSERT(ctx->buffers != NULL);
1618
+ j0 = j1;
1619
+ }
1620
+ }
1353
1621
 
1354
- size_t total_size = 0;
1355
- for (size_t i = 0; i < n_buffers; i++) {
1356
- ctx->buffers[i] = buffers[i];
1357
- total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
1622
+ // record the event of this copy
1623
+ if (split->n_inputs > 0) {
1624
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1625
+ lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
1626
+ }
1627
+ }
1358
1628
  }
1359
1629
 
1360
- return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
1361
- }
1630
+ sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1362
1631
 
1363
- bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
1364
- return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
1632
+ return LM_GGML_STATUS_SUCCESS;
1365
1633
  }
1366
1634
 
1367
- void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
1368
- LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
1369
- lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
1370
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1371
- lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1372
- }
1373
- }
1635
+ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1636
+ lm_ggml_backend_t * backends,
1637
+ lm_ggml_backend_buffer_type_t * bufts,
1638
+ int n_backends,
1639
+ size_t graph_size,
1640
+ bool parallel) {
1641
+ LM_GGML_ASSERT(n_backends > 0);
1642
+ LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
1643
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1374
1644
 
1375
- // creates a copy of the tensor with the same memory layout
1376
- static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context * ctx, const struct lm_ggml_tensor * tensor) {
1377
- struct lm_ggml_tensor * dup = lm_ggml_dup_tensor(ctx, tensor);
1378
- for (int i = 0; i < LM_GGML_MAX_DIMS; i++) {
1379
- dup->nb[i] = tensor->nb[i];
1380
- }
1381
- return dup;
1382
- }
1645
+ struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
1383
1646
 
1384
- static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
1385
- return op == LM_GGML_OP_VIEW || op == LM_GGML_OP_RESHAPE || op == LM_GGML_OP_PERMUTE || op == LM_GGML_OP_TRANSPOSE;
1386
- }
1647
+ const char * LM_GGML_SCHED_DEBUG = getenv("LM_GGML_SCHED_DEBUG");
1648
+ sched->debug = LM_GGML_SCHED_DEBUG ? atoi(LM_GGML_SCHED_DEBUG) : 0;
1649
+ sched->n_backends = n_backends;
1650
+ sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
1387
1651
 
1388
- // scheduler
1652
+ // initialize hash table
1653
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1654
+ sched->hash_set = lm_ggml_hash_set_new(graph_size);
1655
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1656
+ sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1389
1657
 
1390
- #ifndef LM_GGML_SCHED_MAX_BACKENDS
1391
- #define LM_GGML_SCHED_MAX_BACKENDS 16
1392
- #endif
1658
+ const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
1659
+ const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1660
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1661
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1662
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1663
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1393
1664
 
1394
- #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
1395
- #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
1396
- #endif
1665
+ sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1666
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
1397
1667
 
1398
- #ifndef LM_GGML_SCHED_MAX_COPIES
1399
- #define LM_GGML_SCHED_MAX_COPIES 4
1400
- #endif
1668
+ const int initial_splits_capacity = 16;
1669
+ sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1670
+ sched->splits_capacity = initial_splits_capacity;
1401
1671
 
1402
- struct lm_ggml_backend_sched_split {
1403
- int backend_id;
1404
- int i_start;
1405
- int i_end;
1406
- struct lm_ggml_tensor * inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1407
- int n_inputs;
1408
- // graph view of this split
1409
- struct lm_ggml_cgraph graph;
1410
- };
1672
+ for (int b = 0; b < n_backends; b++) {
1673
+ sched->backends[b] = backends[b];
1674
+ sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
1675
+ LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1411
1676
 
1412
- struct lm_ggml_backend_sched {
1413
- bool is_reset; // true if the scheduler has been reset since the last graph split
1414
- bool is_alloc;
1677
+ if (sched->n_copies > 1) {
1678
+ for (int c = 0; c < sched->n_copies; c++) {
1679
+ sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
1680
+ }
1681
+ }
1682
+ }
1415
1683
 
1416
- int n_backends;
1684
+ sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
1417
1685
 
1418
- lm_ggml_backend_t backends[LM_GGML_SCHED_MAX_BACKENDS];
1419
- lm_ggml_backend_buffer_type_t bufts[LM_GGML_SCHED_MAX_BACKENDS];
1420
- lm_ggml_gallocr_t galloc;
1686
+ lm_ggml_backend_sched_reset(sched);
1421
1687
 
1422
- // hash map of the nodes in the graph
1423
- struct lm_ggml_hash_set hash_set;
1424
- int * hv_tensor_backend_ids; // [hash_set.size]
1425
- struct lm_ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
1688
+ return sched;
1689
+ }
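With the constructor in place, a typical call sequence is: create the scheduler with the backend list (CPU last, as the assert above requires), optionally reserve buffers once against a worst-case graph, then allocate and compute graphs. A minimal sketch, assuming already-initialized gpu_backend and cpu_backend handles and pre-built measure_graph/graph cgraphs (those four names are placeholders; only the lm_ggml_backend_sched_* calls are taken from this file):

    lm_ggml_backend_t backends[2] = { gpu_backend, cpu_backend }; // CPU backend must be last
    lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(
        backends, /* bufts = */ NULL, 2, LM_GGML_DEFAULT_GRAPH_SIZE, /* parallel = */ false);

    // optional: size the allocators once with a worst-case graph
    lm_ggml_backend_sched_reserve(sched, measure_graph);

    // per iteration: split, allocate and run the actual graph
    lm_ggml_backend_sched_graph_compute(sched, graph);
    lm_ggml_backend_sched_reset(sched); // clear assignments before building the next graph

    lm_ggml_backend_sched_free(sched);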
1426
1690
 
1427
- int * node_backend_ids; // [graph_size]
1428
- int * leaf_backend_ids; // [graph_size]
1691
+ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
1692
+ if (sched == NULL) {
1693
+ return;
1694
+ }
1695
+ for (int b = 0; b < sched->n_backends; b++) {
1696
+ for (int c = 0; c < sched->n_copies; c++) {
1697
+ lm_ggml_backend_event_free(sched->events[b][c]);
1698
+ }
1699
+ }
1700
+ lm_ggml_gallocr_free(sched->galloc);
1701
+ lm_ggml_free(sched->ctx);
1702
+ lm_ggml_hash_set_free(&sched->hash_set);
1703
+ free(sched->splits);
1704
+ free(sched->hv_tensor_backend_ids);
1705
+ free(sched->hv_tensor_copies);
1706
+ free(sched->node_backend_ids);
1707
+ free(sched->leaf_backend_ids);
1708
+ free(sched->prev_node_backend_ids);
1709
+ free(sched->prev_leaf_backend_ids);
1710
+ free(sched->context_buffer);
1711
+ free(sched->graph.nodes);
1712
+ free(sched->graph.leafs);
1713
+ free(sched);
1714
+ }
1429
1715
 
1430
- int * prev_node_backend_ids; // [graph_size]
1431
- int * prev_leaf_backend_ids; // [graph_size]
1716
+ void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
1717
+ // reset state for the next run
1718
+ if (!sched->is_reset) {
1719
+ lm_ggml_hash_set_reset(&sched->hash_set);
1720
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1721
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1722
+ sched->is_reset = true;
1723
+ }
1724
+ sched->is_alloc = false;
1725
+ }
1432
1726
 
1433
- // copy of the graph with modified inputs
1434
- struct lm_ggml_cgraph graph;
1727
+ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
1728
+ LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1435
1729
 
1436
- // graph splits
1437
- struct lm_ggml_backend_sched_split * splits;
1438
- int n_splits;
1439
- int splits_capacity;
1730
+ lm_ggml_backend_sched_split_graph(sched, measure_graph);
1440
1731
 
1441
- // pipeline parallelism support
1442
- int n_copies;
1443
- int cur_copy;
1444
- lm_ggml_backend_event_t events[LM_GGML_SCHED_MAX_BACKENDS][LM_GGML_SCHED_MAX_COPIES];
1445
- struct lm_ggml_tensor * graph_inputs[LM_GGML_SCHED_MAX_SPLIT_INPUTS];
1446
- int n_graph_inputs;
1732
+ if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1733
+ return false;
1734
+ }
1447
1735
 
1448
- struct lm_ggml_context * ctx;
1736
+ lm_ggml_backend_sched_reset(sched);
1737
+ lm_ggml_backend_sched_synchronize(sched);
1449
1738
 
1450
- lm_ggml_backend_sched_eval_callback callback_eval;
1451
- void * callback_eval_user_data;
1739
+ return true;
1740
+ }
1452
1741
 
1453
- char * context_buffer;
1454
- size_t context_buffer_size;
1742
+ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1743
+ LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1455
1744
 
1456
- bool debug;
1457
- };
1745
+ lm_ggml_backend_sched_split_graph(sched, graph);
1458
1746
 
1459
- #define hash_id(tensor) lm_ggml_hash_find_or_insert(&sched->hash_set, tensor)
1460
- #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1461
- #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1462
- #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1463
1747
 
1464
- // returns the priority of the backend, lower id is higher priority
1465
- static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1466
- for (int i = 0; i < sched->n_backends; i++) {
1467
- if (sched->backends[i] == backend) {
1468
- return i;
1469
- }
1748
+ if (!lm_ggml_backend_sched_alloc_splits(sched)) {
1749
+ return false;
1470
1750
  }
1471
- return -1;
1751
+
1752
+ sched->is_alloc = true;
1753
+
1754
+ return true;
1472
1755
  }
1473
1756
 
1474
- static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
1475
- lm_ggml_backend_buffer_t buffer = tensor->buffer;
1476
- if (buffer == NULL) {
1477
- return -1;
1757
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1758
+ enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
1759
+ lm_ggml_backend_sched_synchronize(sched);
1760
+ return err;
1761
+ }
1762
+
1763
+ enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1764
+ if (!sched->is_reset && !sched->is_alloc) {
1765
+ lm_ggml_backend_sched_reset(sched);
1478
1766
  }
1479
1767
 
1480
- // find highest prio backend that supports the buffer type and the op
1481
- for (int i = 0; i < sched->n_backends; i++) {
1482
- if (lm_ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1483
- lm_ggml_backend_supports_op(sched->backends[i], op)) {
1484
- return i;
1768
+ if (!sched->is_alloc) {
1769
+ if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
1770
+ return LM_GGML_STATUS_ALLOC_FAILED;
1485
1771
  }
1486
1772
  }
1487
1773
 
1488
- #ifndef NDEBUG
1489
- LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1490
- __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
1491
- #endif
1774
+ return lm_ggml_backend_sched_compute_splits(sched);
1775
+ }
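Since the synchronous wrapper above is just the async call followed by lm_ggml_backend_sched_synchronize, a caller can overlap host-side work with backend execution before waiting for the results. A small sketch (prepare_next_batch is a placeholder for work that does not read the graph outputs):

    enum lm_ggml_status st = lm_ggml_backend_sched_graph_compute_async(sched, graph);
    if (st == LM_GGML_STATUS_SUCCESS) {
        prepare_next_batch();                      // host work overlapped with the backends
        lm_ggml_backend_sched_synchronize(sched);  // wait before reading graph outputs
    }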
1492
1776
 
1493
- return -1;
1777
+ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
1778
+ for (int i = 0; i < sched->n_backends; i++) {
1779
+ lm_ggml_backend_synchronize(sched->backends[i]);
1780
+ }
1494
1781
  }
1495
1782
 
1496
- #if 0
1497
- #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
1498
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1499
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1500
- #define GET_CAUSE(node) causes[hash_id(node)]
1501
- #else
1502
- #define SET_CAUSE(node, ...)
1503
- #define GET_CAUSE(node) ""
1504
- #endif
1783
+ void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
1784
+ sched->callback_eval = callback;
1785
+ sched->callback_eval_user_data = user_data;
1786
+ }
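Judging from the calls in lm_ggml_backend_sched_compute_splits above, the callback receives each node twice: first with ask == true to report whether its data is needed (which bounds the [j0, j1] range computed together), then with ask == false after that range has been computed and the split backend synchronized; returning false there skips the remaining nodes of the current split. A hedged sketch of such a callback (the op filter is an arbitrary example):

    // signature inferred from the call sites in this file
    static bool sched_eval_cb(struct lm_ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            // only request a break after matrix multiplications
            return t->op == LM_GGML_OP_MUL_MAT;
        }
        // ask == false: the node is computed and the backend is synchronized,
        // so t can be inspected here (e.g. read back to host memory)
        return true; // false would skip the rest of this split
    }

    // ...
    lm_ggml_backend_sched_set_eval_callback(sched, sched_eval_cb, NULL);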
1505
1787
 
1506
- // returns the backend that should be used for the node based on the current locations
1507
- static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
1508
- // TODO: use supports_op to check if the backend supports the op
1788
+ int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
1789
+ return sched->n_splits;
1790
+ }
1509
1791
 
1510
- // assign pre-allocated nodes to their backend
1511
- int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1512
- if (cur_backend_id != -1) {
1513
- SET_CAUSE(tensor, "1.dst");
1514
- return cur_backend_id;
1515
- }
1792
+ int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
1793
+ return sched->n_copies;
1794
+ }
1516
1795
 
1517
- // view_src
1518
- if (tensor->view_src != NULL) {
1519
- cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1520
- if (cur_backend_id != -1) {
1521
- SET_CAUSE(tensor, "1.vsrc");
1522
- return cur_backend_id;
1523
- }
1524
- }
1525
-
1526
- if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1527
- // since the tensor is pre-allocated, it cannot be moved to another backend
1528
- LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
1529
- }
1796
+ int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
1797
+ return sched->n_backends;
1798
+ }
1530
1799
 
1531
- // graph input
1532
- if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
1533
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1534
- SET_CAUSE(tensor, "1.inp");
1535
- return cur_backend_id;
1536
- }
1800
+ lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
1801
+ LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
1802
+ return sched->backends[i];
1803
+ }
1537
1804
 
1538
- // operations with weights are preferably run on the same backend as the weights
1539
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1540
- const struct lm_ggml_tensor * src = tensor->src[i];
1541
- if (src == NULL) {
1542
- continue;
1543
- }
1544
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1545
- int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1546
- // check if a backend with higher prio wants to offload the op
1547
- if (src_backend_id == sched->n_backends - 1) {
1548
- for (int b = 0; b < src_backend_id; b++) {
1549
- if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
1550
- SET_CAUSE(tensor, "1.off");
1551
- return b;
1552
- }
1553
- }
1554
- }
1555
- SET_CAUSE(tensor, "1.wgt%d", i);
1556
- return src_backend_id;
1557
- }
1558
- }
1805
+ size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
1806
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1807
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1559
1808
 
1560
- return -1;
1809
+ return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1561
1810
  }
1562
1811
 
1563
- static char * fmt_size(size_t size) {
1564
- static char buffer[128];
1565
- if (size >= 1024*1024) {
1566
- snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1567
- } else {
1568
- snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1569
- }
1570
- return buffer;
1812
+ void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
1813
+ int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
1814
+ LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1815
+ tensor_backend_id(node) = backend_index;
1816
+ SET_CAUSE(node, "usr");
1817
+ sched->is_reset = false;
1571
1818
  }
1572
1819
 
1573
- static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1574
- int cur_split = 0;
1575
- for (int i = 0; i < graph->n_nodes; i++) {
1576
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1577
- lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1578
- LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
1579
- sched->splits[cur_split].n_inputs);
1580
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1581
- LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1582
- fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
1583
- }
1584
- LM_GGML_LOG_DEBUG("\n");
1585
- cur_split++;
1586
- }
1587
- struct lm_ggml_tensor * node = graph->nodes[i];
1588
- if (lm_ggml_is_view_op(node->op)) {
1589
- continue;
1590
- }
1591
- lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
1592
- LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
1593
- fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1594
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1595
- struct lm_ggml_tensor * src = node->src[j];
1596
- if (src == NULL) {
1597
- continue;
1598
- }
1599
- lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
1600
- LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1601
- fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1602
- }
1603
- LM_GGML_LOG_DEBUG("\n");
1820
+ lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
1821
+ int backend_index = tensor_backend_id(node);
1822
+ if (backend_index == -1) {
1823
+ return NULL;
1604
1824
  }
1825
+ return sched->backends[backend_index];
1605
1826
  }
1606
1827
 
1607
- static bool lm_ggml_backend_sched_buffer_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * t, int backend_id) {
1608
- lm_ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1609
- lm_ggml_backend_buffer_type_t buft = NULL;
1828
+ // utils
1610
1829
 
1611
- if (buf) {
1612
- // the tensor is already allocated
1613
- buft = buf->buft;
1614
- } else {
1615
- // see if the tensor already has a backend assigned, and use the buffer type of that backend
1616
- int tensor_backend_id = tensor_backend_id(t);
1617
- if (tensor_backend_id == -1 && t->view_src) {
1618
- tensor_backend_id = tensor_backend_id(t->view_src);
1619
- }
1620
- if (tensor_backend_id != -1) {
1621
- buft = sched->bufts[tensor_backend_id];
1622
- }
1623
- }
1830
+ void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
1831
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1832
+ LM_GGML_ASSERT(tensor->view_src != NULL);
1833
+ LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
1834
+ LM_GGML_ASSERT(tensor->view_src->data != NULL);
1624
1835
 
1625
- return buft != NULL && lm_ggml_backend_supports_buft(sched->backends[backend_id], buft);
1836
+ tensor->buffer = tensor->view_src->buffer;
1837
+ tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1838
+ lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1626
1839
  }
1627
1840
 
1628
- static void lm_ggml_backend_sched_set_if_supported(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1629
- if (lm_ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1630
- *node_backend_id = cur_backend_id;
1631
- SET_CAUSE(node, "2.sup");
1632
- }
1841
+ void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
1842
+ LM_GGML_ASSERT(tensor->buffer == NULL);
1843
+ LM_GGML_ASSERT(tensor->data == NULL);
1844
+ LM_GGML_ASSERT(tensor->view_src == NULL);
1845
+ LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
1846
+ LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
1847
+ (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
1848
+
1849
+ tensor->buffer = buffer;
1850
+ tensor->data = addr;
1851
+ lm_ggml_backend_buffer_init_tensor(buffer, tensor);
1633
1852
  }
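These two helpers allow placing tensors into an existing buffer manually, subject to the asserts above (no buffer or data assigned yet, and the address must lie inside the buffer). A minimal sketch, assuming a buffer buf and a tensor t created in a no_alloc context, with the tensor placed at the start of the buffer; real code must also respect the buffer type's alignment and account for tensors already placed:

    char * base = (char *) lm_ggml_backend_buffer_get_base(buf);
    LM_GGML_ASSERT(lm_ggml_backend_buffer_get_alloc_size(buf, t) <= lm_ggml_backend_buffer_get_size(buf));
    lm_ggml_backend_tensor_alloc(buf, t, base);

    // for a view (t_view->view_src already set and allocated), no address is needed:
    lm_ggml_backend_view_init(t_view);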
1634
1853
 
1635
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1636
- static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
1637
- // reset splits
1638
- sched->n_splits = 0;
1639
- sched->n_graph_inputs = 0;
1640
- sched->is_reset = false;
1854
+ static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
1855
+ struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
1641
1856
 
1642
- struct lm_ggml_init_params params = {
1643
- /* .mem_size = */ sched->context_buffer_size,
1644
- /* .mem_buffer = */ sched->context_buffer,
1645
- /* .no_alloc = */ true
1646
- };
1857
+ LM_GGML_ASSERT(src != NULL);
1858
+ LM_GGML_ASSERT(src->data && "graph must be allocated");
1647
1859
 
1648
- lm_ggml_free(sched->ctx);
1860
+ size_t id = lm_ggml_hash_insert(&hash_set, src);
1861
+ if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
1862
+ return node_copies[lm_ggml_hash_find(&hash_set, src)];
1863
+ }
1649
1864
 
1650
- sched->ctx = lm_ggml_init(params);
1651
- if (sched->ctx == NULL) {
1652
- LM_GGML_ABORT("%s: failed to initialize context\n", __func__);
1865
+ struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
1866
+ if (src->view_src != NULL) {
1867
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
1868
+ dst->view_offs = src->view_offs;
1653
1869
  }
1870
+ dst->op = src->op;
1871
+ memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
1872
+ lm_ggml_set_name(dst, src->name);
1654
1873
 
1655
- // pass 1: assign backends to ops with pre-allocated inputs
1656
- for (int i = 0; i < graph->n_leafs; i++) {
1657
- struct lm_ggml_tensor * leaf = graph->leafs[i];
1658
- int * leaf_backend_id = &tensor_backend_id(leaf);
1659
- // do not overwrite user assignments
1660
- if (*leaf_backend_id == -1) {
1661
- *leaf_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, leaf);
1874
+ // copy src
1875
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1876
+ struct lm_ggml_tensor * s = src->src[i];
1877
+ if (s == NULL) {
1878
+ continue;
1662
1879
  }
1880
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
1663
1881
  }
1664
1882
 
1665
- for (int i = 0; i < graph->n_nodes; i++) {
1666
- struct lm_ggml_tensor * node = graph->nodes[i];
1667
- int * node_backend_id = &tensor_backend_id(node);
1668
- // do not overwrite user assignments
1669
- if (*node_backend_id == -1) {
1670
- *node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
1883
+ node_copies[id] = dst;
1884
+ return dst;
1885
+ }
1671
1886
 
1672
- #if 0
1673
- // src
1674
- if (node->op == LM_GGML_OP_NONE) {
1675
- continue;
1676
- }
1887
+ static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
1888
+ size_t id = lm_ggml_hash_find(hash_set, src);
1889
+ if (node_init[id]) {
1890
+ return;
1891
+ }
1892
+ node_init[id] = true;
1677
1893
 
1678
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1679
- struct lm_ggml_tensor * src = node->src[j];
1680
- if (src == NULL) {
1681
- continue;
1682
- }
1683
- int * src_backend_id = &tensor_backend_id(src);
1684
- if (*src_backend_id == -1) {
1685
- *src_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, src);
1686
- }
1687
- }
1688
- #endif
1689
- }
1894
+ struct lm_ggml_tensor * dst = node_copies[id];
1895
+ if (dst->view_src != NULL) {
1896
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1897
+ lm_ggml_backend_view_init(dst);
1898
+ }
1899
+ else {
1900
+ lm_ggml_backend_tensor_copy(src, dst);
1690
1901
  }
1691
1902
 
1692
- // pass 2: expand current backend assignments
1693
- // assign the same backend to adjacent nodes
1694
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1695
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1696
- // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1697
- // expand gpu down
1698
- {
1699
- int cur_backend_id = -1;
1700
- for (int i = 0; i < graph->n_nodes; i++) {
1701
- struct lm_ggml_tensor * node = graph->nodes[i];
1702
- if (lm_ggml_is_view_op(node->op)) {
1703
- continue;
1704
- }
1705
- int * node_backend_id = &tensor_backend_id(node);
1706
- if (*node_backend_id != -1) {
1707
- if (*node_backend_id == sched->n_backends - 1) {
1708
- // skip cpu (lowest prio backend)
1709
- cur_backend_id = -1;
1710
- } else {
1711
- cur_backend_id = *node_backend_id;
1712
- }
1713
- } else if (cur_backend_id != -1) {
1714
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1715
- }
1903
+ // init src
1904
+ for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
1905
+ struct lm_ggml_tensor * s = src->src[i];
1906
+ if (s == NULL) {
1907
+ continue;
1716
1908
  }
1909
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
1717
1910
  }
1718
- // expand gpu up
1719
- {
1720
- int cur_backend_id = -1;
1721
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1722
- struct lm_ggml_tensor * node = graph->nodes[i];
1723
- if (lm_ggml_is_view_op(node->op)) {
1724
- continue;
1725
- }
1726
- int * node_backend_id = &tensor_backend_id(node);
1727
- if (*node_backend_id != -1) {
1728
- if (*node_backend_id == sched->n_backends - 1) {
1729
- // skip cpu (lowest prio backend)
1730
- cur_backend_id = -1;
1731
- } else {
1732
- cur_backend_id = *node_backend_id;
1733
- }
1734
- } else if (cur_backend_id != -1) {
1735
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1736
- }
1737
- }
1738
- }
1739
- // expand rest down
1740
- {
1741
- int cur_backend_id = -1;
1742
- for (int i = 0; i < graph->n_nodes; i++) {
1743
- struct lm_ggml_tensor * node = graph->nodes[i];
1744
- if (lm_ggml_is_view_op(node->op)) {
1745
- continue;
1746
- }
1747
- int * node_backend_id = &tensor_backend_id(node);
1748
- if (*node_backend_id != -1) {
1749
- cur_backend_id = *node_backend_id;
1750
- } else if (cur_backend_id != -1) {
1751
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1752
- }
1753
- }
1754
- }
1755
- // expand rest up
1756
- {
1757
- int cur_backend_id = -1;
1758
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1759
- struct lm_ggml_tensor * node = graph->nodes[i];
1760
- if (lm_ggml_is_view_op(node->op)) {
1761
- continue;
1762
- }
1763
- int * node_backend_id = &tensor_backend_id(node);
1764
- if (*node_backend_id != -1) {
1765
- cur_backend_id = *node_backend_id;
1766
- } else if (cur_backend_id != -1) {
1767
- lm_ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1768
- }
1769
- }
1770
- }
1771
-
1772
- // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1773
- // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1774
- // however, we also need to verify that the sources are in compatible buffer types
1775
- // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1776
- // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1777
- // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1778
- // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1779
- // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1780
- for (int i = 0; i < graph->n_nodes; i++) {
1781
- struct lm_ggml_tensor * node = graph->nodes[i];
1782
- if (lm_ggml_is_view_op(node->op)) {
1783
- continue;
1784
- }
1785
- int * node_backend_id = &tensor_backend_id(node);
1786
- if (*node_backend_id == -1) {
1787
- // unassigned node: find the backend with the most supported inputs
1788
- int n_supported_best = -1;
1789
- for (int b = 0; b < sched->n_backends; b++) {
1790
- if (lm_ggml_backend_supports_op(sched->backends[b], node)) {
1791
- int n_supported = 0;
1792
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1793
- struct lm_ggml_tensor * src = node->src[j];
1794
- if (src == NULL) {
1795
- continue;
1796
- }
1797
- if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1798
- n_supported++;
1799
- }
1800
- }
1801
- if (n_supported > n_supported_best) {
1802
- n_supported_best = n_supported;
1803
- *node_backend_id = b;
1804
- SET_CAUSE(node, "3.best");
1805
- }
1806
- }
1807
- }
1808
- } else {
1809
- // assigned node: upgrade to higher prio backend if possible
1810
- for (int b = 0; b < *node_backend_id; b++) {
1811
- if (sched->bufts[b] == sched->bufts[*node_backend_id] && lm_ggml_backend_supports_op(sched->backends[b], node)) {
1812
- bool supported = true;
1813
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1814
- struct lm_ggml_tensor * src = node->src[j];
1815
- if (src == NULL) {
1816
- continue;
1817
- }
1818
- if (!lm_ggml_backend_sched_buffer_supported(sched, src, b)) {
1819
- supported = false;
1820
- break;
1821
- }
1822
- }
1823
- if (supported) {
1824
- *node_backend_id = b;
1825
- SET_CAUSE(node, "3.upg");
1826
- break;
1827
- }
1828
- }
1829
- }
1830
- }
1831
- }
1832
-
1833
- // pass 4: assign backends to remaining src from dst and view_src
1834
- for (int i = 0; i < graph->n_nodes; i++) {
1835
- struct lm_ggml_tensor * node = graph->nodes[i];
1836
- int * cur_backend_id = &tensor_backend_id(node);
1837
- if (node->view_src != NULL && *cur_backend_id == -1) {
1838
- *cur_backend_id = tensor_backend_id(node->view_src);
1839
- SET_CAUSE(node, "4.vsrc");
1840
- }
1841
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1842
- struct lm_ggml_tensor * src = node->src[j];
1843
- if (src == NULL) {
1844
- continue;
1845
- }
1846
- int * src_backend_id = &tensor_backend_id(src);
1847
- if (*src_backend_id == -1) {
1848
- if (src->view_src != NULL) {
1849
- // views are always on the same backend as the source
1850
- *src_backend_id = tensor_backend_id(src->view_src);
1851
- SET_CAUSE(src, "4.vsrc");
1852
- } else {
1853
- *src_backend_id = *cur_backend_id;
1854
- SET_CAUSE(src, "4.cur");
1855
- }
1856
- }
1857
- }
1858
- }
1859
-
1860
- // pass 5: split graph, find tensors that need to be copied
1861
- {
1862
- int i_split = 0;
1863
- struct lm_ggml_backend_sched_split * split = &sched->splits[0];
1864
- // find the backend of the first split, skipping view ops
1865
- int i = 0;
1866
- for (; i < graph->n_nodes; i++) {
1867
- struct lm_ggml_tensor * node = graph->nodes[i];
1868
- if (!lm_ggml_is_view_op(node->op)) {
1869
- split->backend_id = tensor_backend_id(node);
1870
- break;
1871
- }
1872
- }
1873
- split->i_start = 0;
1874
- split->n_inputs = 0;
1875
- int cur_backend_id = split->backend_id;
1876
- for (; i < graph->n_nodes; i++) {
1877
- struct lm_ggml_tensor * node = graph->nodes[i];
1878
-
1879
- if (lm_ggml_is_view_op(node->op)) {
1880
- continue;
1881
- }
1882
-
1883
- const int node_backend_id = tensor_backend_id(node);
1884
-
1885
- assert(node_backend_id != -1); // all nodes should be assigned by now
1886
-
1887
- // check if we should start a new split based on the sources of the current node
1888
- bool need_new_split = false;
1889
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1890
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1891
- struct lm_ggml_tensor * src = node->src[j];
1892
- if (src == NULL) {
1893
- continue;
1894
- }
1895
- // check if a weight is on a different backend
1896
- // by starting a new split, the memory of the previously offloaded weights can be reused
1897
- if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1898
- int src_backend_id = tensor_backend_id(src);
1899
- if (src_backend_id != cur_backend_id) {
1900
- need_new_split = true;
1901
- break;
1902
- }
1903
- }
1904
- // check if the split has too many inputs
1905
- // FIXME: count the number of inputs instead of only checking when full
1906
- if (split->n_inputs == LM_GGML_SCHED_MAX_SPLIT_INPUTS) {
1907
- const size_t id = hash_id(src);
1908
- int src_backend_id = sched->hv_tensor_backend_ids[id];
1909
- bool supported = lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1910
- if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1911
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1912
- need_new_split = true;
1913
- break;
1914
- }
1915
- }
1916
- }
1917
- }
1911
+ }
1918
1912
 
1919
- if (node_backend_id != cur_backend_id || need_new_split) {
1920
- split->i_end = i;
1921
- i_split++;
1922
- if (i_split >= sched->splits_capacity) {
1923
- sched->splits_capacity *= 2;
1924
- sched->splits = (lm_ggml_backend_sched_split *)
1925
- realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1926
- LM_GGML_ASSERT(sched->splits != NULL);
1927
- }
1928
- split = &sched->splits[i_split];
1929
- split->backend_id = node_backend_id;
1930
- split->i_start = i;
1931
- split->n_inputs = 0;
1932
- cur_backend_id = node_backend_id;
1933
- }
1913
+ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
1914
+ struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
1915
+ struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
1916
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
1934
1917
 
1935
- // find inputs that are not on the same backend
1936
- for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
1937
- struct lm_ggml_tensor * src = node->src[j];
1938
- if (src == NULL) {
1939
- continue;
1940
- }
1918
+ struct lm_ggml_init_params params = {
1919
+ /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
1920
+ /* .mem_buffer = */ NULL,
1921
+ /* .no_alloc = */ true
1922
+ };
1941
1923
 
1942
- size_t src_id = hash_id(src);
1943
- const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1944
- assert(src_backend_id != -1); // all inputs should be assigned by now
1924
+ struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
1925
+ struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
1945
1926
 
1946
- if (src->flags & LM_GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1947
- if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1948
- lm_ggml_backend_t backend = sched->backends[src_backend_id];
1949
- for (int c = 0; c < sched->n_copies; c++) {
1950
- struct lm_ggml_tensor * tensor_copy;
1951
- if (c == sched->cur_copy) {
1952
- tensor_copy = src; // use the original tensor as the current copy
1953
- } else {
1954
- tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1955
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1956
- }
1957
- if (sched->n_copies > 1) {
1958
- lm_ggml_set_input(tensor_copy);
1959
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1960
- }
1961
- tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1962
- SET_CAUSE(tensor_copy, "4.cpy");
1963
- }
1964
- int n_graph_inputs = sched->n_graph_inputs++;
1965
- LM_GGML_ASSERT(n_graph_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1966
- sched->graph_inputs[n_graph_inputs] = src;
1967
- }
1968
- }
1927
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1928
+ LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
1929
+ lm_ggml_hash_set_free(&hash_set);
1930
+ free(node_copies);
1931
+ free(node_init);
1932
+ lm_ggml_free(ctx_allocated);
1933
+ lm_ggml_free(ctx_unallocated);
1934
+ return {
1935
+ /* .buffer = */ NULL,
1936
+ /* .ctx_allocated = */ NULL,
1937
+ /* .ctx_unallocated = */ NULL,
1938
+ /* .graph = */ NULL,
1939
+ };
1940
+ }
1969
1941
 
1970
- if (src_backend_id != cur_backend_id && !lm_ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1971
- // create a copy of the input in the split's backend
1972
- if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1973
- lm_ggml_backend_t backend = sched->backends[cur_backend_id];
1974
- for (int c = 0; c < sched->n_copies; c++) {
1975
- struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src);
1976
- lm_ggml_format_name(tensor_copy, "%s#%s#%d", lm_ggml_backend_name(backend), src->name, c);
1977
- if (sched->n_copies > 1) {
1978
- lm_ggml_set_input(tensor_copy);
1979
- lm_ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1980
- }
1981
- tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1982
- SET_CAUSE(tensor_copy, "4.cpy");
1983
- }
1984
- int n_inputs = split->n_inputs++;
1985
- LM_GGML_ASSERT(n_inputs < LM_GGML_SCHED_MAX_SPLIT_INPUTS);
1986
- split->inputs[n_inputs] = src;
1987
- }
1988
- node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1989
- }
1990
- }
1991
- }
1992
- split->i_end = graph->n_nodes;
1993
- sched->n_splits = i_split + 1;
1942
+ // dup nodes
1943
+ for (int i = 0; i < graph->n_nodes; i++) {
1944
+ struct lm_ggml_tensor * node = graph->nodes[i];
1945
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
1994
1946
  }
1995
1947
 
1996
- if (sched->debug) {
1997
- lm_ggml_backend_sched_print_assignments(sched, graph);
1948
+ // allocate nodes
1949
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1950
+ if (buffer == NULL) {
1951
+ LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
1952
+ lm_ggml_hash_set_free(&hash_set);
1953
+ free(node_copies);
1954
+ free(node_init);
1955
+ lm_ggml_free(ctx_allocated);
1956
+ lm_ggml_free(ctx_unallocated);
1957
+ return {
1958
+ /* .buffer = */ NULL,
1959
+ /* .ctx_allocated = */ NULL,
1960
+ /* .ctx_unallocated = */ NULL,
1961
+ /* .graph = */ NULL,
1962
+ };
1998
1963
  }
1999
1964
 
2000
- // swap node_backend_ids and leaf _backend_ids with prevs
2001
- {
2002
- int * tmp = sched->node_backend_ids;
2003
- sched->node_backend_ids = sched->prev_node_backend_ids;
2004
- sched->prev_node_backend_ids = tmp;
1965
+ //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2005
1966
 
2006
- tmp = sched->leaf_backend_ids;
2007
- sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
2008
- sched->prev_leaf_backend_ids = tmp;
1967
+ // copy data and init views
1968
+ for (int i = 0; i < graph->n_nodes; i++) {
1969
+ struct lm_ggml_tensor * node = graph->nodes[i];
1970
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2009
1971
  }
2010
1972
 
2011
- int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
2012
- if (sched->graph.size < graph_size) {
2013
- sched->graph.size = graph_size;
2014
- sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
2015
- sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
2016
- LM_GGML_ASSERT(sched->graph.nodes != NULL);
2017
- LM_GGML_ASSERT(sched->graph.leafs != NULL);
1973
+ // build graph copy
1974
+ struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
1975
+ for (int i = 0; i < graph->n_nodes; i++) {
1976
+ struct lm_ggml_tensor * node = graph->nodes[i];
1977
+ struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
1978
+ graph_copy->nodes[i] = node_copy;
2018
1979
  }
2019
- sched->graph.n_nodes = 0;
2020
- sched->graph.n_leafs = 0;
1980
+ graph_copy->n_nodes = graph->n_nodes;
2021
1981
 
2022
- struct lm_ggml_cgraph * graph_copy = &sched->graph;
1982
+ lm_ggml_hash_set_free(&hash_set);
1983
+ free(node_copies);
1984
+ free(node_init);
2023
1985
 
2024
- for (int i = 0; i < sched->n_splits; i++) {
2025
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
2026
- split->graph = lm_ggml_graph_view(graph, split->i_start, split->i_end);
1986
+ return {
1987
+ /* .buffer = */ buffer,
1988
+ /* .ctx_allocated = */ ctx_allocated,
1989
+ /* .ctx_unallocated = */ ctx_unallocated,
1990
+ /* .graph = */ graph_copy,
1991
+ };
1992
+ }
2027
1993
 
2028
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
2029
- for (int j = 0; j < split->n_inputs; j++) {
2030
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1994
+ void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
1995
+ lm_ggml_backend_buffer_free(copy.buffer);
1996
+ lm_ggml_free(copy.ctx_allocated);
1997
+ lm_ggml_free(copy.ctx_unallocated);
1998
+ }
2031
1999
 
2032
- struct lm_ggml_tensor * input = split->inputs[j];
2033
- const size_t input_id = hash_id(input);
2034
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
2000
+ bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2001
+ struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2002
+ if (copy.buffer == NULL) {
2003
+ return false;
2004
+ }
2035
2005
 
2036
- // add a dependency to the input source so that it is not freed before the copy is done
2037
- struct lm_ggml_tensor * input_dep = lm_ggml_view_tensor(sched->ctx, input);
2038
- input_dep->src[0] = input;
2039
- sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
2040
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
2006
+ struct lm_ggml_cgraph * g1 = graph;
2007
+ struct lm_ggml_cgraph * g2 = copy.graph;
2041
2008
 
2042
- // add a dependency to the input copy so that it is allocated at the start of the split
2043
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
2044
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
2045
- }
2009
+ assert(g1->n_nodes == g2->n_nodes);
2046
2010
 
2047
- for (int j = split->i_start; j < split->i_end; j++) {
2048
- assert(graph_copy->size > graph_copy->n_nodes);
2049
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
2050
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
2051
- }
2052
- }
2011
+ for (int i = 0; i < g1->n_nodes; i++) {
2012
+ //printf("eval %d/%d\n", i, g1->n_nodes);
2013
+ struct lm_ggml_tensor * t1 = g1->nodes[i];
2014
+ struct lm_ggml_tensor * t2 = g2->nodes[i];
2053
2015
 
2054
- if (sched->n_copies > 1) {
2055
- // add input copies as leafs so that they are allocated first
2056
- for (int i = 0; i < sched->n_graph_inputs; i++) {
2057
- struct lm_ggml_tensor * input = sched->graph_inputs[i];
2058
- size_t id = hash_id(input);
2059
- int backend_id = tensor_backend_id(input);
2060
- for (int c = 0; c < sched->n_copies; c++) {
2061
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2062
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2063
- assert(graph_copy->size > graph_copy->n_leafs);
2064
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2065
- }
2016
+ assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2017
+
2018
+ struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2019
+ struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2020
+
2021
+ lm_ggml_backend_graph_compute(backend1, &g1v);
2022
+ lm_ggml_backend_graph_compute(backend2, &g2v);
2023
+
2024
+ if (lm_ggml_is_view_op(t1->op)) {
2025
+ continue;
2066
2026
  }
2067
2027
 
2068
- for (int i = 0; i < sched->n_splits; i++) {
2069
- struct lm_ggml_backend_sched_split * split = &sched->splits[i];
2070
- int backend_id = split->backend_id;
2071
- for (int j = 0; j < split->n_inputs; j++) {
2072
- struct lm_ggml_tensor * input = split->inputs[j];
2073
- size_t id = hash_id(input);
2074
- for (int c = 0; c < sched->n_copies; c++) {
2075
- struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
2076
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2077
- assert(graph_copy->size > graph_copy->n_leafs);
2078
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
2079
- }
2080
- }
2028
+ // compare results, calculate rms etc
2029
+ if (!callback(i, t1, t2, user_data)) {
2030
+ break;
2081
2031
  }
2082
2032
  }
2083
2033
 
2084
- // add leafs from the original graph
2085
- for (int i = 0; i < graph->n_leafs; i++) {
2086
- struct lm_ggml_tensor * leaf = graph->leafs[i];
2087
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
2088
- assert(graph_copy->size > graph_copy->n_leafs);
2089
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
2090
- }
2034
+ lm_ggml_backend_graph_copy_free(copy);
2035
+
2036
+ return true;
2091
2037
  }
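For reference, a minimal sketch of driving this comparison entry point. The callback shape (node index, the two tensors, user data, returning true to continue) follows the call made above; lm_ggml_backend_tensor_get and lm_ggml_nbytes are assumed to be available from the public headers, and a byte-wise memcmp stands in for the tolerance-based comparison a real harness would use:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include "ggml.h"
    #include "ggml-backend.h"

    // invoked once per computed (non-view) node on both backends; return true to keep walking
    static bool compare_cb(int node_index, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data) {
        const size_t nbytes = lm_ggml_nbytes(t1);

        void * buf1 = malloc(nbytes);
        void * buf2 = malloc(nbytes);

        // pull both results back to host memory regardless of where they were computed
        lm_ggml_backend_tensor_get(t1, buf1, 0, nbytes);
        lm_ggml_backend_tensor_get(t2, buf2, 0, nbytes);

        if (memcmp(buf1, buf2, nbytes) != 0) {
            fprintf(stderr, "node %d (%s): outputs differ\n", node_index, t1->name);
        }

        free(buf1);
        free(buf2);

        (void) user_data;
        return true;
    }

    // graph must already be allocated and computable on backend1; backend2 works on an internal copy
    static bool check_backends(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph) {
        return lm_ggml_backend_compare_graph_backend(backend1, backend2, graph, compare_cb, NULL);
    }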
2092
2038
 
2093
- static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
2094
- bool backend_ids_changed = false;
2095
- for (int i = 0; i < sched->graph.n_nodes; i++) {
2096
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
2097
- sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
2098
- backend_ids_changed = true;
2099
- break;
2100
- }
2039
+
2040
+
2041
+ #include "ggml-backend.h"
2042
+ #include "ggml-backend-impl.h"
2043
+ #include "ggml-cpu.h"
2044
+ #include "ggml-impl.h"
2045
+ #include <cctype>
2046
+ #include <string>
2047
+
2048
+ // ggml-backend interface
2049
+
2050
+ // CPU backend - buffer
2051
+
2052
+ static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
2053
+ uintptr_t data = (uintptr_t)buffer->context;
2054
+
2055
+ // align the buffer
2056
+ if (data % TENSOR_ALIGNMENT != 0) {
2057
+ data = LM_GGML_PAD(data, TENSOR_ALIGNMENT);
2101
2058
  }
2102
- if (!backend_ids_changed) {
2103
- for (int i = 0; i < sched->graph.n_leafs; i++) {
2104
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
2105
- sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
2106
- backend_ids_changed = true;
2107
- break;
2108
- }
2109
- }
2059
+
2060
+ return (void *)data;
2061
+ }
2062
+
2063
+ static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2064
+ lm_ggml_aligned_free(buffer->context, buffer->size);
2065
+ }
2066
+
2067
+ static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
2068
+ memset((char *)tensor->data + offset, value, size);
2069
+
2070
+ LM_GGML_UNUSED(buffer);
2071
+ }
2072
+
2073
+ static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2074
+ memcpy((char *)tensor->data + offset, data, size);
2075
+
2076
+ LM_GGML_UNUSED(buffer);
2077
+ }
2078
+
2079
+ static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2080
+ memcpy(data, (const char *)tensor->data + offset, size);
2081
+
2082
+ LM_GGML_UNUSED(buffer);
2083
+ }
2084
+
2085
+ static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
2086
+ if (lm_ggml_backend_buffer_is_host(src->buffer)) {
2087
+ memcpy(dst->data, src->data, lm_ggml_nbytes(src));
2088
+ return true;
2110
2089
  }
2090
+ return false;
2111
2091
 
2112
- // allocate graph
2113
- if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2114
- // the re-allocation may cause the split inputs to be moved to a different address
2115
- lm_ggml_backend_sched_synchronize(sched);
2116
- #ifndef NDEBUG
2117
- LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
2118
- #endif
2119
- lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
2120
- if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
2121
- LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
2122
- return false;
2123
- }
2092
+ LM_GGML_UNUSED(buffer);
2093
+ }
2094
+
2095
+ static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
2096
+ memset(buffer->context, value, buffer->size);
2097
+ }
2098
+
2099
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
2100
+ /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
2101
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2102
+ /* .init_tensor = */ NULL, // no initialization required
2103
+ /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2104
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2105
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2106
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2107
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2108
+ /* .reset = */ NULL,
2109
+ };
2110
+
2111
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
2112
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
2113
+ /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
2114
+ /* .init_tensor = */ NULL, // no initialization required
2115
+ /* .memset_tensor = */ lm_ggml_backend_cpu_buffer_memset_tensor,
2116
+ /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor,
2117
+ /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor,
2118
+ /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor,
2119
+ /* .clear = */ lm_ggml_backend_cpu_buffer_clear,
2120
+ /* .reset = */ NULL,
2121
+ };
2122
+
2123
+ // CPU backend - buffer type
2124
+
2125
+ static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2126
+ return "CPU";
2127
+
2128
+ LM_GGML_UNUSED(buft);
2129
+ }
2130
+
2131
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2132
+ void * data = lm_ggml_aligned_malloc(size);
2133
+
2134
+ if (data == NULL) {
2135
+ LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
2136
+ return NULL;
2124
2137
  }
2125
2138
 
2126
- return true;
2139
+ return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
2127
2140
  }
2128
2141
 
2129
- static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_sched_t sched) {
2130
- struct lm_ggml_backend_sched_split * splits = sched->splits;
2142
+ static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
2143
+ return TENSOR_ALIGNMENT;
2131
2144
 
2132
- for (int i = 0; i < sched->n_splits; i++) {
2133
- struct lm_ggml_backend_sched_split * split = &splits[i];
2134
- int split_backend_id = split->backend_id;
2135
- lm_ggml_backend_t split_backend = sched->backends[split_backend_id];
2145
+ LM_GGML_UNUSED(buft);
2146
+ }
2136
2147
 
2137
- // copy the input tensors to the split backend
2138
- for (int j = 0; j < split->n_inputs; j++) {
2139
- lm_ggml_backend_t input_backend = lm_ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
2140
- struct lm_ggml_tensor * input = split->inputs[j];
2141
- struct lm_ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
2148
+ static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
2149
+ return true;
2142
2150
 
2143
- if (input->flags & LM_GGML_TENSOR_FLAG_INPUT) {
2144
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
2145
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2146
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2147
- } else {
2148
- lm_ggml_backend_synchronize(split_backend);
2149
- }
2150
- lm_ggml_backend_tensor_copy(input, input_cpy);
2151
- } else {
2152
- // wait for the split backend to finish using the input before overwriting it
2153
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2154
- lm_ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
2155
- } else {
2156
- lm_ggml_backend_synchronize(split_backend);
2157
- }
2158
- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
2159
- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
2160
- if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
2161
- lm_ggml_backend_synchronize(input_backend);
2162
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2163
- lm_ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2164
- } else {
2165
- lm_ggml_backend_synchronize(split_backend);
2166
- }
2167
- lm_ggml_backend_tensor_copy(input, input_cpy);
2168
- }
2169
- }
2170
- }
2151
+ LM_GGML_UNUSED(buft);
2152
+ }
2171
2153
 
2172
- if (!sched->callback_eval) {
2173
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &split->graph);
2174
- if (ec != LM_GGML_STATUS_SUCCESS) {
2175
- return ec;
2176
- }
2177
- } else {
2178
- // similar to lm_ggml_backend_compare_graph_backend
2179
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
2180
- struct lm_ggml_tensor * t = split->graph.nodes[j0];
2154
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
2155
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2156
+ /* .iface = */ {
2157
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
2158
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2159
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2160
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2161
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2162
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2163
+ },
2164
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2165
+ /* .context = */ NULL,
2166
+ };
2181
2167
 
2182
- // check if the user needs data from this node
2183
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2168
+ return &lm_ggml_backend_cpu_buffer_type;
2169
+ }
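As a usage sketch for this buffer type (relying on the generic lm_ggml_backend_buft_*/lm_ggml_backend_buffer_* wrappers that ggml-backend.h is assumed to expose), a raw host buffer can be allocated, inspected, cleared and released without ever creating a backend instance:

    #include <stdio.h>
    #include "ggml-backend.h"

    static void cpu_buft_demo(void) {
        lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();

        // 1 MiB of host memory, aligned by lm_ggml_aligned_malloc in the allocator above
        lm_ggml_backend_buffer_t buf = lm_ggml_backend_buft_alloc_buffer(buft, 1024*1024);
        if (buf == NULL) {
            return;
        }

        printf("buffer '%s': %zu bytes at %p\n",
            lm_ggml_backend_buffer_name(buf),
            lm_ggml_backend_buffer_get_size(buf),
            lm_ggml_backend_buffer_get_base(buf));

        lm_ggml_backend_buffer_clear(buf, 0); // dispatches to lm_ggml_backend_cpu_buffer_clear
        lm_ggml_backend_buffer_free(buf);
    }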
2184
2170
 
2185
- int j1 = j0;
2171
+ static const char * lm_ggml_backend_cpu_buffer_from_ptr_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2172
+ return "CPU_Mapped";
2186
2173
 
2187
- // determine the range [j0, j1] of nodes that can be computed together
2188
- while (!need && j1 < split->graph.n_nodes - 1) {
2189
- t = split->graph.nodes[++j1];
2190
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
2191
- }
2174
+ LM_GGML_UNUSED(buft);
2175
+ }
2192
2176
 
2193
- struct lm_ggml_cgraph gv = lm_ggml_graph_view(&split->graph, j0, j1 + 1);
2177
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(void) {
2178
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
2179
+ /* .iface = */ {
2180
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_from_ptr_type_get_name,
2181
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
2182
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2183
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2184
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2185
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2186
+ },
2187
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2188
+ /* .context = */ NULL,
2189
+ };
2194
2190
 
2195
- enum lm_ggml_status ec = lm_ggml_backend_graph_compute_async(split_backend, &gv);
2196
- if (ec != LM_GGML_STATUS_SUCCESS) {
2197
- return ec;
2198
- }
2191
+ return &lm_ggml_backend_cpu_buffer_type;
2192
+ }
2199
2193
 
2200
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
2201
- lm_ggml_backend_synchronize(split_backend);
2194
+ #ifdef LM_GGML_USE_CPU_HBM
2202
2195
 
2203
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
2204
- break;
2205
- }
2196
+ // buffer type HBM
2206
2197
 
2207
- j0 = j1;
2208
- }
2209
- }
2198
+ #include <hbwmalloc.h>
2210
2199
 
2211
- // record the event of this copy
2212
- if (split->n_inputs > 0) {
2213
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2214
- lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
2215
- }
2216
- }
2217
- }
2200
+ static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
2201
+ return "CPU_HBM";
2218
2202
 
2219
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
2203
+ LM_GGML_UNUSED(buft);
2204
+ }
2220
2205
 
2221
- return LM_GGML_STATUS_SUCCESS;
2206
+ static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
2207
+ hbw_free(buffer->context);
2222
2208
  }
2223
2209
 
2224
- lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
2225
- lm_ggml_backend_t * backends,
2226
- lm_ggml_backend_buffer_type_t * bufts,
2227
- int n_backends,
2228
- size_t graph_size,
2229
- bool parallel) {
2230
- LM_GGML_ASSERT(n_backends > 0);
2231
- LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
2232
- LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
2210
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
2211
+ void * ptr;
2212
+ int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
2213
+ if (result != 0) {
2214
+ LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
2215
+ return NULL;
2216
+ }
2233
2217
 
2234
- struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
2218
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2219
+ buffer->buft = buft;
2220
+ buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
2235
2221
 
2236
- sched->debug = getenv("LM_GGML_SCHED_DEBUG") != NULL;
2237
- sched->n_backends = n_backends;
2238
- sched->n_copies = parallel ? LM_GGML_SCHED_MAX_COPIES : 1;
2222
+ return buffer;
2223
+ }
2239
2224
 
2240
- // initialize hash table
2241
- // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
2242
- sched->hash_set = lm_ggml_hash_set_new(graph_size);
2243
- sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2244
- sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
2225
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
2226
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
2227
+ /* .iface = */ {
2228
+ /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
2229
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
2230
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
2231
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2232
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
2233
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
2234
+ },
2235
+ /* .context = */ NULL,
2236
+ };
2245
2237
 
2246
- const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
2247
- const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
2248
- sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
2249
- sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
2250
- sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
2251
- sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
2238
+ return &lm_ggml_backend_cpu_buffer_type_hbm;
2239
+ }
2240
+ #endif
2252
2241
 
2253
- sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
2254
- sched->context_buffer = (char *) malloc(sched->context_buffer_size);
2242
+ static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
2243
+ static lm_ggml_backend_buffer_type_t bufts[] = {
2244
+ #ifdef LM_GGML_USE_CPU_HBM
2245
+ lm_ggml_backend_cpu_hbm_buffer_type(),
2246
+ #endif
2247
+ NULL
2248
+ };
2255
2249
 
2256
- const int initial_splits_capacity = 16;
2257
- sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
2258
- sched->splits_capacity = initial_splits_capacity;
2250
+ return bufts;
2259
2251
 
2260
- for (int b = 0; b < n_backends; b++) {
2261
- sched->backends[b] = backends[b];
2262
- sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
2263
- LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
2252
+ LM_GGML_UNUSED(device);
2253
+ }
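The list returned here is NULL-terminated, so a caller that obtains it (in practice through the "lm_ggml_backend_dev_get_extra_bufts" proc address registered further down) would walk it as in this small sketch; lm_ggml_backend_buft_name and lm_ggml_backend_buft_is_host are assumed from the public API:

    #include <stdio.h>
    #include "ggml-backend.h"

    // walk a NULL-terminated buffer-type list such as the one returned above
    static void print_extra_bufts(lm_ggml_backend_buffer_type_t * bufts) {
        for (; bufts != NULL && *bufts != NULL; bufts++) {
            printf("extra buft: %s (is_host = %d)\n",
                lm_ggml_backend_buft_name(*bufts), lm_ggml_backend_buft_is_host(*bufts));
        }
    }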
2264
2254
 
2265
- if (sched->n_copies > 1) {
2266
- for (int c = 0; c < sched->n_copies; c++) {
2267
- sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
2268
- }
2269
- }
2270
- }
2255
+ // CPU backend - backend (stream)
2271
2256
 
2272
- sched->galloc = lm_ggml_gallocr_new_n(sched->bufts, n_backends);
2257
+ struct lm_ggml_backend_cpu_context {
2258
+ int n_threads;
2259
+ lm_ggml_threadpool_t threadpool;
2273
2260
 
2274
- lm_ggml_backend_sched_reset(sched);
2261
+ uint8_t * work_data;
2262
+ size_t work_size;
2275
2263
 
2276
- return sched;
2277
- }
2264
+ lm_ggml_abort_callback abort_callback;
2265
+ void * abort_callback_data;
2266
+ };
2278
2267
 
2279
- void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
2280
- if (sched == NULL) {
2281
- return;
2282
- }
2283
- for (int b = 0; b < sched->n_backends; b++) {
2284
- for (int c = 0; c < sched->n_copies; c++) {
2285
- lm_ggml_backend_event_free(sched->events[b][c]);
2286
- }
2287
- }
2288
- lm_ggml_gallocr_free(sched->galloc);
2289
- lm_ggml_free(sched->ctx);
2290
- lm_ggml_hash_set_free(&sched->hash_set);
2291
- free(sched->splits);
2292
- free(sched->hv_tensor_backend_ids);
2293
- free(sched->hv_tensor_copies);
2294
- free(sched->node_backend_ids);
2295
- free(sched->leaf_backend_ids);
2296
- free(sched->prev_node_backend_ids);
2297
- free(sched->prev_leaf_backend_ids);
2298
- free(sched->context_buffer);
2299
- free(sched->graph.nodes);
2300
- free(sched->graph.leafs);
2301
- free(sched);
2268
+ static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
2269
+ return "CPU";
2270
+
2271
+ LM_GGML_UNUSED(backend);
2302
2272
  }
2303
2273
 
2304
- void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
2305
- // reset state for the next run
2306
- if (!sched->is_reset) {
2307
- lm_ggml_hash_set_reset(&sched->hash_set);
2308
- memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2309
- memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
2310
- sched->is_reset = true;
2311
- }
2312
- sched->is_alloc = false;
2274
+ static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
2275
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2276
+ delete[] cpu_ctx->work_data;
2277
+ delete cpu_ctx;
2278
+ delete backend;
2313
2279
  }
2314
2280
 
2315
- bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) {
2316
- LM_GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
2281
+ struct lm_ggml_backend_plan_cpu {
2282
+ struct lm_ggml_cplan cplan;
2283
+ struct lm_ggml_cgraph cgraph;
2284
+ };
2285
+
2286
+ static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
2287
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2317
2288
 
2318
- lm_ggml_backend_sched_split_graph(sched, measure_graph);
2289
+ struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
2319
2290
 
2320
- if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
2321
- return false;
2291
+ cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2292
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
2293
+
2294
+ if (cpu_plan->cplan.work_size > 0) {
2295
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
2296
+ if (cpu_plan->cplan.work_data == NULL) {
2297
+ delete cpu_plan;
2298
+ return NULL;
2299
+ }
2322
2300
  }
2323
2301
 
2324
- lm_ggml_backend_sched_reset(sched);
2325
- lm_ggml_backend_sched_synchronize(sched);
2302
+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
2303
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2326
2304
 
2327
- return true;
2305
+ return cpu_plan;
2328
2306
  }
2329
2307
 
2330
- bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2331
- LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
2308
+ static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2309
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2332
2310
 
2333
- lm_ggml_backend_sched_split_graph(sched, graph);
2311
+ delete[] cpu_plan->cplan.work_data;
2312
+ delete cpu_plan;
2334
2313
 
2314
+ LM_GGML_UNUSED(backend);
2315
+ }
2335
2316
 
2336
- if (!lm_ggml_backend_sched_alloc_splits(sched)) {
2337
- return false;
2338
- }
2317
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
2318
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
2339
2319
 
2340
- sched->is_alloc = true;
2320
+ return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
2341
2321
 
2342
- return true;
2322
+ LM_GGML_UNUSED(backend);
2343
2323
  }
2344
2324
 
2345
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2346
- enum lm_ggml_status err = lm_ggml_backend_sched_graph_compute_async(sched, graph);
2347
- lm_ggml_backend_sched_synchronize(sched);
2348
- return err;
2349
- }
2325
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
2326
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
2350
2327
 
2351
- enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) {
2352
- if (!sched->is_reset && !sched->is_alloc) {
2353
- lm_ggml_backend_sched_reset(sched);
2354
- }
2328
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
2355
2329
 
2356
- if (!sched->is_alloc) {
2357
- if (!lm_ggml_backend_sched_alloc_graph(sched, graph)) {
2330
+ if (cpu_ctx->work_size < cplan.work_size) {
2331
+ delete[] cpu_ctx->work_data;
2332
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
2333
+ if (cpu_ctx->work_data == NULL) {
2334
+ cpu_ctx->work_size = 0;
2358
2335
  return LM_GGML_STATUS_ALLOC_FAILED;
2359
2336
  }
2337
+ cpu_ctx->work_size = cplan.work_size;
2360
2338
  }
2339
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;
2361
2340
 
2362
- return lm_ggml_backend_sched_compute_splits(sched);
2363
- }
2341
+ cplan.abort_callback = cpu_ctx->abort_callback;
2342
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
2364
2343
 
2365
- void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
2366
- for (int i = 0; i < sched->n_backends; i++) {
2367
- lm_ggml_backend_synchronize(sched->backends[i]);
2368
- }
2344
+ return lm_ggml_graph_compute(cgraph, &cplan);
2369
2345
  }
2370
2346
 
2371
- void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
2372
- sched->callback_eval = callback;
2373
- sched->callback_eval_user_data = user_data;
2374
- }
2347
+ static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
2348
+ /* .get_name = */ lm_ggml_backend_cpu_get_name,
2349
+ /* .free = */ lm_ggml_backend_cpu_free,
2350
+ /* .set_tensor_async = */ NULL,
2351
+ /* .get_tensor_async = */ NULL,
2352
+ /* .cpy_tensor_async = */ NULL,
2353
+ /* .synchronize = */ NULL,
2354
+ /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
2355
+ /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
2356
+ /* .graph_plan_update = */ NULL,
2357
+ /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
2358
+ /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
2359
+ /* .event_record = */ NULL,
2360
+ /* .event_wait = */ NULL,
2361
+ };
2375
2362
 
2376
- int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) {
2377
- return sched->n_splits;
2363
+ static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
2364
+ static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
2365
+ return &guid;
2378
2366
  }
2379
2367
 
2380
- int lm_ggml_backend_sched_get_n_copies(lm_ggml_backend_sched_t sched) {
2381
- return sched->n_copies;
2382
- }
2368
+ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
2369
+ // initialize CPU backend now to avoid slowing the first graph computation
2370
+ lm_ggml_cpu_init();
2383
2371
 
2384
- int lm_ggml_backend_sched_get_n_backends(lm_ggml_backend_sched_t sched) {
2385
- return sched->n_backends;
2386
- }
2372
+ struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
2373
+ if (ctx == NULL) {
2374
+ return NULL;
2375
+ }
2387
2376
 
2388
- lm_ggml_backend_t lm_ggml_backend_sched_get_backend(lm_ggml_backend_sched_t sched, int i) {
2389
- LM_GGML_ASSERT(i >= 0 && i < sched->n_backends);
2390
- return sched->backends[i];
2391
- }
2377
+ ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
2378
+ ctx->threadpool = NULL;
2379
+ ctx->work_data = NULL;
2380
+ ctx->work_size = 0;
2381
+ ctx->abort_callback = NULL;
2382
+ ctx->abort_callback_data = NULL;
2392
2383
 
2393
- size_t lm_ggml_backend_sched_get_buffer_size(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) {
2394
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2395
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2384
+ lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
2385
+ /* .guid = */ lm_ggml_backend_cpu_guid(),
2386
+ /* .interface = */ lm_ggml_backend_cpu_i,
2387
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
2388
+ /* .context = */ ctx,
2389
+ };
2396
2390
 
2397
- return lm_ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2391
+ if (cpu_backend == NULL) {
2392
+ delete ctx;
2393
+ return NULL;
2394
+ }
2395
+
2396
+ return cpu_backend;
2398
2397
  }
2399
2398
 
2400
- void lm_ggml_backend_sched_set_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend) {
2401
- int backend_index = lm_ggml_backend_sched_backend_id(sched, backend);
2402
- LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2403
- tensor_backend_id(node) = backend_index;
2404
- SET_CAUSE(node, "usr");
2405
- sched->is_reset = false;
2399
+ bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
2400
+ return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
2406
2401
  }
2407
2402
 
2408
- lm_ggml_backend_t lm_ggml_backend_sched_get_tensor_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) {
2409
- int backend_index = tensor_backend_id(node);
2410
- if (backend_index == -1) {
2411
- return NULL;
2412
- }
2413
- return sched->backends[backend_index];
2403
+ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
2404
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2405
+
2406
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2407
+ ctx->n_threads = n_threads;
2414
2408
  }
2415
2409
 
2416
- // utils
2410
+ void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
2411
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2417
2412
 
2418
- void lm_ggml_backend_view_init(struct lm_ggml_tensor * tensor) {
2419
- LM_GGML_ASSERT(tensor->buffer == NULL);
2420
- LM_GGML_ASSERT(tensor->view_src != NULL);
2421
- LM_GGML_ASSERT(tensor->view_src->buffer != NULL);
2422
- LM_GGML_ASSERT(tensor->view_src->data != NULL);
2413
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2423
2414
 
2424
- tensor->buffer = tensor->view_src->buffer;
2425
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2426
- lm_ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2415
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
2416
+ // already had a different threadpool, pause/suspend it before switching
2417
+ lm_ggml_threadpool_pause(ctx->threadpool);
2418
+ }
2419
+ ctx->threadpool = threadpool;
2427
2420
  }
2428
2421
 
2429
- void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr) {
2430
- LM_GGML_ASSERT(tensor->buffer == NULL);
2431
- LM_GGML_ASSERT(tensor->data == NULL);
2432
- LM_GGML_ASSERT(tensor->view_src == NULL);
2433
- LM_GGML_ASSERT(addr >= lm_ggml_backend_buffer_get_base(buffer));
2434
- LM_GGML_ASSERT((char *)addr + lm_ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2435
- (char *)lm_ggml_backend_buffer_get_base(buffer) + lm_ggml_backend_buffer_get_size(buffer));
2422
+ void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
2423
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
2436
2424
 
2437
- tensor->buffer = buffer;
2438
- tensor->data = addr;
2439
- lm_ggml_backend_buffer_init_tensor(buffer, tensor);
2425
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
2426
+ ctx->abort_callback = abort_callback;
2427
+ ctx->abort_callback_data = abort_callback_data;
2440
2428
  }
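Putting the pieces above together, a minimal sketch of driving the CPU backend end to end; the core lm_ggml_* graph API from ggml.h and lm_ggml_backend_alloc_ctx_tensors from ggml-alloc.h are assumed, and the tensor shapes are purely illustrative:

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static void cpu_backend_demo(void) {
        lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
        lm_ggml_backend_cpu_set_n_threads(backend, 4);

        // build a tiny graph c = a + b; no_alloc so the data lives in a backend buffer, not the context
        struct lm_ggml_init_params params = {
            /* .mem_size   = */ lm_ggml_tensor_overhead()*8 + lm_ggml_graph_overhead(),
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,
        };
        struct lm_ggml_context * ctx = lm_ggml_init(params);

        struct lm_ggml_tensor * a = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 16);
        struct lm_ggml_tensor * b = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 16);
        struct lm_ggml_tensor * c = lm_ggml_add(ctx, a, b);

        struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx);
        lm_ggml_build_forward_expand(gf, c);

        // place every context tensor in a buffer of the backend's default buffer type
        lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors(ctx, backend);

        float ones[16];
        for (int i = 0; i < 16; i++) ones[i] = 1.0f;
        lm_ggml_backend_tensor_set(a, ones, 0, sizeof(ones));
        lm_ggml_backend_tensor_set(b, ones, 0, sizeof(ones));

        lm_ggml_backend_graph_compute(backend, gf); // ends up in lm_ggml_backend_cpu_graph_compute above

        float out[16];
        lm_ggml_backend_tensor_get(c, out, 0, sizeof(out));
        printf("c[0] = %f\n", out[0]); // expected: 2.0

        lm_ggml_backend_buffer_free(buf);
        lm_ggml_free(ctx);
        lm_ggml_backend_free(backend);
    }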
2441
2429
 
2442
- static struct lm_ggml_tensor * graph_copy_dup_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor ** node_copies,
2443
- struct lm_ggml_context * ctx_allocated, struct lm_ggml_context * ctx_unallocated, struct lm_ggml_tensor * src) {
2444
-
2445
- LM_GGML_ASSERT(src != NULL);
2446
- LM_GGML_ASSERT(src->data && "graph must be allocated");
2430
+ lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
2431
+ LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
2432
+ return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
2433
+ }
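A short sketch of wrapping caller-owned memory with this entry point and placing a tensor in it by hand via lm_ggml_backend_tensor_alloc; the 64-byte alignment is assumed to satisfy the TENSOR_ALIGNMENT assert above, and ownership of the pointer stays with the caller (the from_ptr interface has a NULL free_buffer):

    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    static void from_ptr_demo(void) {
        const size_t size = 4096;
        void * mem = aligned_alloc(64, size); // caller-owned, aligned host memory

        lm_ggml_backend_buffer_t buf = lm_ggml_backend_cpu_buffer_from_ptr(mem, size);

        struct lm_ggml_init_params params = {
            /* .mem_size   = */ lm_ggml_tensor_overhead()*4,
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,
        };
        struct lm_ggml_context * ctx = lm_ggml_init(params);

        // a 1024-element F32 tensor (4096 bytes) placed at the start of the wrapped region
        struct lm_ggml_tensor * t = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 1024);
        lm_ggml_backend_tensor_alloc(buf, t, mem);

        lm_ggml_free(ctx);
        lm_ggml_backend_buffer_free(buf); // releases the buffer object only, not mem
        free(mem);
    }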
2447
2434
 
2448
- size_t id = lm_ggml_hash_insert(&hash_set, src);
2449
- if (id == LM_GGML_HASHSET_ALREADY_EXISTS) {
2450
- return node_copies[lm_ggml_hash_find(&hash_set, src)];
2451
- }
2435
+ // CPU backend - device
2452
2436
 
2453
- struct lm_ggml_tensor * dst = lm_ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2454
- if (src->view_src != NULL) {
2455
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2456
- dst->view_offs = src->view_offs;
2457
- }
2458
- dst->op = src->op;
2459
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2460
- lm_ggml_set_name(dst, src->name);
2437
+ struct lm_ggml_backend_cpu_device_context {
2438
+ std::string description = "CPU";
2461
2439
 
2462
- // copy src
2463
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2464
- struct lm_ggml_tensor * s = src->src[i];
2465
- if (s == NULL) {
2466
- continue;
2440
+ lm_ggml_backend_cpu_device_context() {
2441
+ #ifdef __APPLE__
2442
+ size_t len = 0;
2443
+ if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
2444
+ description.resize(len);
2445
+ sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
2446
+ }
2447
+ #elif defined(__linux__)
2448
+ FILE * f = fopen("/proc/cpuinfo", "r");
2449
+ if (f) {
2450
+ char buf[1024];
2451
+ while (fgets(buf, sizeof(buf), f)) {
2452
+ if (strncmp(buf, "model name", 10) == 0) {
2453
+ char * p = strchr(buf, ':');
2454
+ if (p) {
2455
+ p++;
2456
+ while (std::isspace(*p)) {
2457
+ p++;
2458
+ }
2459
+ while (std::isspace(p[strlen(p) - 1])) {
2460
+ p[strlen(p) - 1] = '\0';
2461
+ }
2462
+ description = p;
2463
+ break;
2464
+ }
2465
+ }
2466
+ }
2467
+ fclose(f);
2468
+ }
2469
+ #elif defined(_WIN32)
2470
+ HKEY hKey;
2471
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
2472
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
2473
+ 0,
2474
+ KEY_READ,
2475
+ &hKey) == ERROR_SUCCESS) {
2476
+ DWORD cpu_brand_size = 0;
2477
+ if (RegQueryValueExA(hKey,
2478
+ TEXT("ProcessorNameString"),
2479
+ NULL,
2480
+ NULL,
2481
+ NULL,
2482
+ &cpu_brand_size) == ERROR_SUCCESS) {
2483
+ description.resize(cpu_brand_size);
2484
+ if (RegQueryValueExA(hKey,
2485
+ TEXT("ProcessorNameString"),
2486
+ NULL,
2487
+ NULL,
2488
+ (LPBYTE)&description[0], // NOLINT
2489
+ &cpu_brand_size) == ERROR_SUCCESS) {
2490
+ if (description.find('\0') != std::string::npos) {
2491
+ description.resize(description.find('\0'));
2492
+ }
2493
+ }
2494
+ }
2495
+ RegCloseKey(hKey);
2467
2496
  }
2468
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2497
+ #endif
2469
2498
  }
2499
+ };
2470
2500
 
2471
- node_copies[id] = dst;
2472
- return dst;
2501
+ static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
2502
+ return "CPU";
2503
+
2504
+ LM_GGML_UNUSED(dev);
2473
2505
  }
2474
2506
 
2475
- static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor ** node_copies, bool * node_init, struct lm_ggml_tensor * src) {
2476
- size_t id = lm_ggml_hash_find(hash_set, src);
2477
- if (node_init[id]) {
2478
- return;
2479
- }
2480
- node_init[id] = true;
2507
+ static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
2508
+ struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
2481
2509
 
2482
- struct lm_ggml_tensor * dst = node_copies[id];
2483
- if (dst->view_src != NULL) {
2484
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2485
- lm_ggml_backend_view_init(dst);
2486
- }
2487
- else {
2488
- lm_ggml_backend_tensor_copy(src, dst);
2489
- }
2510
+ return ctx->description.c_str();
2511
+ }
2490
2512
 
2491
- // init src
2492
- for (int i = 0; i < LM_GGML_MAX_SRC; i++) {
2493
- struct lm_ggml_tensor * s = src->src[i];
2494
- if (s == NULL) {
2495
- continue;
2496
- }
2497
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2498
- }
2513
+ static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
2514
+ // TODO
2515
+ *free = 0;
2516
+ *total = 0;
2517
+
2518
+ LM_GGML_UNUSED(dev);
2499
2519
  }
2500
2520
 
2501
- struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
2502
- struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
2503
- struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2504
- bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
2521
+ static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
2522
+ return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
2505
2523
 
2506
- struct lm_ggml_init_params params = {
2507
- /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
2508
- /* .mem_buffer = */ NULL,
2509
- /* .no_alloc = */ true
2524
+ LM_GGML_UNUSED(dev);
2525
+ }
2526
+
2527
+ static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
2528
+ props->name = lm_ggml_backend_cpu_device_get_name(dev);
2529
+ props->description = lm_ggml_backend_cpu_device_get_description(dev);
2530
+ props->type = lm_ggml_backend_cpu_device_get_type(dev);
2531
+ lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
2532
+ props->caps = {
2533
+ /* .async = */ false,
2534
+ /* .host_buffer = */ false,
2535
+ /* .buffer_from_host_ptr = */ true,
2536
+ /* .events = */ false,
2510
2537
  };
2538
+ }
2511
2539
 
2512
- struct lm_ggml_context * ctx_allocated = lm_ggml_init(params);
2513
- struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
2540
+ static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
2541
+ return lm_ggml_backend_cpu_init();
2514
2542
 
2515
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2516
- LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
2517
- lm_ggml_hash_set_free(&hash_set);
2518
- free(node_copies);
2519
- free(node_init);
2520
- lm_ggml_free(ctx_allocated);
2521
- lm_ggml_free(ctx_unallocated);
2522
- return {
2523
- /* .buffer = */ NULL,
2524
- /* .ctx_allocated = */ NULL,
2525
- /* .ctx_unallocated = */ NULL,
2526
- /* .graph = */ NULL,
2527
- };
2528
- }
2543
+ LM_GGML_UNUSED(dev);
2544
+ LM_GGML_UNUSED(params);
2545
+ }
2529
2546
 
2530
- // dup nodes
2531
- for (int i = 0; i < graph->n_nodes; i++) {
2532
- struct lm_ggml_tensor * node = graph->nodes[i];
2533
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2534
- }
2547
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
2548
+ return lm_ggml_backend_cpu_buffer_type();
2535
2549
 
2536
- // allocate nodes
2537
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2538
- if (buffer == NULL) {
2539
- LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
2540
- lm_ggml_hash_set_free(&hash_set);
2541
- free(node_copies);
2542
- free(node_init);
2543
- lm_ggml_free(ctx_allocated);
2544
- lm_ggml_free(ctx_unallocated);
2545
- return {
2546
- /* .buffer = */ NULL,
2547
- /* .ctx_allocated = */ NULL,
2548
- /* .ctx_unallocated = */ NULL,
2549
- /* .graph = */ NULL,
2550
- };
2551
- }
2550
+ LM_GGML_UNUSED(dev);
2551
+ }
2552
2552
 
2553
- //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2553
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
2554
+ return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
2554
2555
 
2555
- // copy data and init views
2556
- for (int i = 0; i < graph->n_nodes; i++) {
2557
- struct lm_ggml_tensor * node = graph->nodes[i];
2558
- graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2559
- }
2556
+ LM_GGML_UNUSED(dev);
2557
+ LM_GGML_UNUSED(max_tensor_size);
2558
+ }
2560
2559
 
2561
- // build graph copy
2562
- struct lm_ggml_cgraph * graph_copy = lm_ggml_new_graph_custom(ctx_allocated, graph->size, false);
2563
- for (int i = 0; i < graph->n_nodes; i++) {
2564
- struct lm_ggml_tensor * node = graph->nodes[i];
2565
- struct lm_ggml_tensor * node_copy = node_copies[lm_ggml_hash_find(&hash_set, node)];
2566
- graph_copy->nodes[i] = node_copy;
2560
+ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
2561
+ switch (op->op) {
2562
+ case LM_GGML_OP_CPY:
2563
+ return
2564
+ op->type != LM_GGML_TYPE_IQ2_XXS &&
2565
+ op->type != LM_GGML_TYPE_IQ2_XS &&
2566
+ op->type != LM_GGML_TYPE_IQ1_S &&
2567
+ op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
2568
+ case LM_GGML_OP_MUL_MAT:
2569
+ return op->src[1]->type == LM_GGML_TYPE_F32;// FIXME || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
2570
+ case LM_GGML_OP_ROPE_BACK:
2571
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
2572
+ case LM_GGML_OP_IM2COL_BACK:
2573
+ return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
2574
+ case LM_GGML_OP_OUT_PROD:
2575
+ return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
2576
+ default:
2577
+ return true;
2567
2578
  }
2568
- graph_copy->n_nodes = graph->n_nodes;
2569
-
2570
- lm_ggml_hash_set_free(&hash_set);
2571
- free(node_copies);
2572
- free(node_init);
2573
2579
 
2574
- return {
2575
- /* .buffer = */ buffer,
2576
- /* .ctx_allocated = */ ctx_allocated,
2577
- /* .ctx_unallocated = */ ctx_unallocated,
2578
- /* .graph = */ graph_copy,
2579
- };
2580
+ LM_GGML_UNUSED(dev);
2580
2581
  }
2581
2582
 
2582
- void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) {
2583
- lm_ggml_backend_buffer_free(copy.buffer);
2584
- lm_ggml_free(copy.ctx_allocated);
2585
- lm_ggml_free(copy.ctx_unallocated);
2583
+ static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
2584
+ return lm_ggml_backend_buft_is_host(buft);
2585
+
2586
+ LM_GGML_UNUSED(dev);
2586
2587
  }
2587
2588
 
2588
- bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) {
2589
- struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph);
2590
- if (copy.buffer == NULL) {
2591
- return false;
2592
- }
2589
+ static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
2590
+ /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
2591
+ /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
2592
+ /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
2593
+ /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
2594
+ /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
2595
+ /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
2596
+ /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
2597
+ /* .get_host_buffer_type = */ NULL,
2598
+ /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
2599
+ /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
2600
+ /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
2601
+ /* .offload_op = */ NULL,
2602
+ /* .event_new = */ NULL,
2603
+ /* .event_free = */ NULL,
2604
+ /* .event_synchronize = */ NULL,
2605
+ };
2593
2606
 
2594
- struct lm_ggml_cgraph * g1 = graph;
2595
- struct lm_ggml_cgraph * g2 = copy.graph;
2607
+ // CPU backend - backend (reg)
2596
2608
 
2597
- assert(g1->n_nodes == g2->n_nodes);
2609
+ static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
2610
+ return "CPU";
2598
2611
 
2599
- for (int i = 0; i < g1->n_nodes; i++) {
2600
- //printf("eval %d/%d\n", i, g1->n_nodes);
2601
- struct lm_ggml_tensor * t1 = g1->nodes[i];
2602
- struct lm_ggml_tensor * t2 = g2->nodes[i];
2612
+ LM_GGML_UNUSED(reg);
2613
+ }
2603
2614
 
2604
- assert(t1->op == t2->op && lm_ggml_are_same_layout(t1, t2));
2615
+ static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
2616
+ return 1;
2605
2617
 
2606
- struct lm_ggml_cgraph g1v = lm_ggml_graph_view(g1, i, i + 1);
2607
- struct lm_ggml_cgraph g2v = lm_ggml_graph_view(g2, i, i + 1);
2618
+ LM_GGML_UNUSED(reg);
2619
+ }
2608
2620
 
2609
- lm_ggml_backend_graph_compute(backend1, &g1v);
2610
- lm_ggml_backend_graph_compute(backend2, &g2v);
2621
+ static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
2622
+ LM_GGML_ASSERT(index == 0);
2611
2623
 
2612
- if (lm_ggml_is_view_op(t1->op)) {
2613
- continue;
2614
- }
2624
+ static lm_ggml_backend_cpu_device_context ctx;
2625
+ static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
2626
+ /* .iface = */ lm_ggml_backend_cpu_device_i,
2627
+ /* .reg = */ reg,
2628
+ /* .context = */ &ctx,
2629
+ };
2615
2630
 
2616
- // compare results, calculate rms etc
2617
- if (!callback(i, t1, t2, user_data)) {
2618
- break;
2619
- }
2631
+ return &lm_ggml_backend_cpu_device;
2632
+ }
2633
+
2634
+ static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
2635
+ if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
2636
+ return (void *)lm_ggml_backend_cpu_set_n_threads;
2637
+ }
2638
+ if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
2639
+ return (void *)lm_ggml_backend_cpu_get_extra_bufts;
2620
2640
  }
2621
2641
 
2622
- lm_ggml_backend_graph_copy_free(copy);
2642
+ return NULL;
2623
2643
 
2624
- return true;
2644
+ LM_GGML_UNUSED(reg);
2645
+ }
2646
+
2647
+ static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
2648
+ /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
2649
+ /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
2650
+ /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
2651
+ /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
2652
+ };
2653
+
2654
+ lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
2655
+ static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
2656
+ /* .iface = */ lm_ggml_backend_cpu_reg_i,
2657
+ /* .context = */ NULL,
2658
+ };
2659
+
2660
+ return &lm_ggml_backend_cpu_reg;
2625
2661
  }
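Finally, a sketch of consuming this registration from the outside: fetch the single CPU device, create a backend from it, and look up the thread-count setter through the proc-address hook registered above. lm_ggml_backend_dev_init and lm_ggml_backend_reg_get_proc_address are assumed to be the public wrappers over the reg/device interfaces shown here:

    #include "ggml-backend.h"

    static lm_ggml_backend_t cpu_backend_from_reg(void) {
        lm_ggml_backend_reg_t reg = lm_ggml_backend_cpu_reg();

        // the CPU registry exposes exactly one device (see get_device_count above)
        lm_ggml_backend_dev_t dev = lm_ggml_backend_reg_dev_get(reg, 0);
        lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, NULL); // forwards to lm_ggml_backend_cpu_init

        // optional features are exported by name through get_proc_address
        typedef void (*set_n_threads_fn)(lm_ggml_backend_t, int);
        set_n_threads_fn set_n_threads =
            (set_n_threads_fn) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_n_threads");
        if (set_n_threads != NULL) {
            set_n_threads(backend, 8);
        }

        return backend;
    }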