llama_cpp 0.16.0 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
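Of the vendored llama.cpp files listed above, the excerpt below covers data/vendor/tmp/llama.cpp/ggml-backend.c and data/vendor/tmp/llama.cpp/ggml-backend.h, where backend/buffer-type compatibility checks and the graph scheduler's backend assignment are reworked.

data/vendor/tmp/llama.cpp/ggml-backend.c: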
@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
     return ggml_nbytes(tensor);
 }
 
-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }
 
+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return backend->iface.supports_buft(backend, buft);
+}
+
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);
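These first two hunks are the core API change of the update: the buffer-type-side query `ggml_backend_buft_supports_backend` is removed and the backend-side query `ggml_backend_supports_buft` takes its place. A minimal migration sketch for client code (the wrapper name `can_use_buffer_type` is mine, not from the gem):

```c
#include "ggml-backend.h"

// Hedged sketch: how a compatibility check migrates across this update.
static bool can_use_buffer_type(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    // before this diff (removed):
    //   return ggml_backend_buft_supports_backend(buft, backend);
    // after: the backend answers for itself, which lets several backends
    // share one buffer type (e.g. host memory used by both BLAS and CPU)
    return ggml_backend_supports_buft(backend, buft);
}
```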
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
     GGML_UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment     = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size      = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size    = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend  = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host           = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_alignment     = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size      = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size    = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend  = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host           = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(backend);
+}
+
 static struct ggml_backend_i cpu_backend_i = {
     /* .get_name                = */ ggml_backend_cpu_name,
     /* .free                    = */ ggml_backend_cpu_free,
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .supports_buft           = */ ggml_backend_cpu_supports_buft,
     /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
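For backend implementors, the CPU hunks above show the two new `ggml_backend_i` members: `.graph_plan_update`, which may be left NULL, and `.supports_buft`, which answers the scheduler's compatibility query. A hedged sketch of the same hook for a hypothetical non-CPU backend; the `my_backend_*` names and the buffer-type getter are assumptions for illustration, not part of this diff:

```c
// Hypothetical device backend: where the CPU backend accepts any host
// buffer type (via ggml_backend_buft_is_host), a device backend would
// typically accept only the buffer type it allocates itself.
static bool my_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    return buft == my_backend_buffer_type(); // assumed getter, not a real ggml symbol

    GGML_UNUSED(backend);
}
```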
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
 
+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
 
@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }
 
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
 
-    // find highest prio backend that supports the buffer type
+    // find highest prio backend that supports the buffer type and the op
     for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
             return i;
         }
     }
 
-    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
-            __func__, ggml_backend_buffer_name(buffer), tensor->name);
-    GGML_ASSERT(false);
+#ifndef NDEBUG
+    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+            __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif
 
     return -1;
 }
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
 
     // assign pre-allocated nodes to their backend
-    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
         return cur_backend_id;
@@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
         if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
             return cur_backend_id;
@@ -1161,11 +1168,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
-                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
                         return b;
                     }
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }
 
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id == -1 && t->view_src) {
+            tensor_backend_id = tensor_backend_id(t->view_src);
+        }
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+}
+
+static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.sup");
+    }
+}
 
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-
-    // pass 2.2 expand gpu down
+    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
+    // expand gpu down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.1 expand gpu up
+    // expand gpu up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.4 expand rest down
+    // expand rest down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.3 expand rest up
+    // expand rest up
    {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
 
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+    // pass 3: upgrade nodes to higher prio backends with compatible buffer types
+    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+    // however, we also need to verify that the sources are in compatible buffer types
+    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
+    // additionally, set remaining unassigned nodes to the backend with the most supported inputs
+    // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id == -1) {
+            // unassigned node: find the backend with the most supported inputs
+            int n_supported_best = -1;
+            for (int b = 0; b < sched->n_backends; b++) {
+                if (ggml_backend_supports_op(sched->backends[b], node)) {
+                    int n_supported = 0;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            n_supported++;
+                        }
+                    }
+                    if (n_supported > n_supported_best) {
+                        n_supported_best = n_supported;
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.best");
+                    }
+                }
+            }
+        } else {
+            // assigned node: upgrade to higher prio backend if possible
+            for (int b = 0; b < *node_backend_id; b++) {
+                if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+                    bool supported = true;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            supported = false;
+                            break;
+                        }
+                    }
+                    if (supported) {
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.upg");
+                        break;
+                    }
+                }
+            }
+        }
+    }
 
-    // pass 3: assign backends to remaining src from dst and view_src
+    // pass 4: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * cur_backend_id = &tensor_backend_id(node);
         if (node->view_src != NULL && *cur_backend_id == -1) {
             *cur_backend_id = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
+            SET_CAUSE(node, "4.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
|
|
1391
1470
|
if (src->view_src != NULL) {
|
1392
1471
|
// views are always on the same backend as the source
|
1393
1472
|
*src_backend_id = tensor_backend_id(src->view_src);
|
1394
|
-
SET_CAUSE(src, "
|
1473
|
+
SET_CAUSE(src, "4.vsrc");
|
1395
1474
|
} else {
|
1396
1475
|
*src_backend_id = *cur_backend_id;
|
1397
|
-
SET_CAUSE(src, "
|
1476
|
+
SET_CAUSE(src, "4.cur");
|
1398
1477
|
}
|
1399
1478
|
}
|
1400
1479
|
}
|
1401
1480
|
}
|
1402
|
-
#ifdef DEBUG_PASS3
|
1403
|
-
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
1404
|
-
#endif
|
1405
1481
|
|
1406
1482
|
// pass 4: split graph, find tensors that need to be copied
|
1407
1483
|
{
|
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
1448
1524
|
}
|
1449
1525
|
}
|
1450
1526
|
// check if the split has too many inputs
|
1527
|
+
// FIXME: count the number of inputs instead of only checking when full
|
1451
1528
|
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
1452
1529
|
const size_t id = hash_id(src);
|
1453
1530
|
int src_backend_id = sched->tensor_backend_id[id];
|
1454
|
-
|
1531
|
+
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
1532
|
+
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
|
1455
1533
|
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
1456
1534
|
need_new_split = true;
|
1457
1535
|
break;
|
@@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
1486
1564
|
const int src_backend_id = tensor_backend_id(src);
|
1487
1565
|
assert(src_backend_id != -1); // all inputs should be assigned by now
|
1488
1566
|
|
1489
|
-
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1)
|
1567
|
+
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
1490
1568
|
size_t id = hash_id(src);
|
1491
1569
|
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
1492
1570
|
ggml_backend_t backend = sched->backends[src_backend_id];
|
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
1511
1589
|
}
|
1512
1590
|
}
|
1513
1591
|
|
1514
|
-
|
1592
|
+
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
1593
|
+
if (src_backend_id != cur_backend_id && !supported) {
|
1515
1594
|
// create a copy of the input in the split's backend
|
1516
1595
|
const size_t id = hash_id(src);
|
1517
1596
|
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
1537
1616
|
split->i_end = graph->n_nodes;
|
1538
1617
|
sched->n_splits = i_split + 1;
|
1539
1618
|
}
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1619
|
+
|
1620
|
+
if (sched->debug) {
|
1621
|
+
ggml_backend_sched_print_assignments(sched, graph);
|
1622
|
+
}
|
1623
|
+
|
1624
|
+
// swap node_backend_ids and leaf_backend_ids and prevs
|
1625
|
+
{
|
1626
|
+
int * tmp = sched->node_backend_ids;
|
1627
|
+
sched->node_backend_ids = sched->prev_node_backend_ids;
|
1628
|
+
sched->prev_node_backend_ids = tmp;
|
1629
|
+
|
1630
|
+
tmp = sched->leaf_backend_ids;
|
1631
|
+
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
1632
|
+
sched->prev_leaf_backend_ids = tmp;
|
1633
|
+
}
|
1543
1634
|
|
1544
1635
|
// create copies of the graph for each split
|
1545
1636
|
// TODO: avoid this copy
|
@@ -1613,8 +1704,26 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
1613
1704
|
}
|
1614
1705
|
|
1615
1706
|
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
1707
|
+
bool backend_ids_changed = false;
|
1708
|
+
for (int i = 0; i < sched->graph->n_nodes; i++) {
|
1709
|
+
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
1710
|
+
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
1711
|
+
backend_ids_changed = true;
|
1712
|
+
break;
|
1713
|
+
}
|
1714
|
+
}
|
1715
|
+
if (!backend_ids_changed) {
|
1716
|
+
for (int i = 0; i < sched->graph->n_leafs; i++) {
|
1717
|
+
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
1718
|
+
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
1719
|
+
backend_ids_changed = true;
|
1720
|
+
break;
|
1721
|
+
}
|
1722
|
+
}
|
1723
|
+
}
|
1724
|
+
|
1616
1725
|
// allocate graph
|
1617
|
-
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
1726
|
+
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
1618
1727
|
// the re-allocation may cause the split inputs to be moved to a different address
|
1619
1728
|
ggml_backend_sched_synchronize(sched);
|
1620
1729
|
#ifndef NDEBUG
|
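Note: `ggml_backend_sched_alloc_splits` now compares the fresh node and leaf assignments against the previous graph's (the `prev_*` arrays swapped in at the end of `ggml_backend_sched_split_graph`). If any tensor moved to a backend with a different buffer type, the cached allocation cannot be trusted, so the scheduler forces the synchronize-and-reallocate branch below instead of reusing the existing layout.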
@@ -1727,6 +1836,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
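Note: this runtime switch replaces the compile-time `DEBUG_PASS*` dumps removed earlier in this diff; with the `GGML_SCHED_DEBUG` environment variable set, the scheduler prints its assignments via `ggml_backend_sched_print_assignments` after every graph split, no rebuild required.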
@@ -1735,6 +1846,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
@@ -1747,7 +1860,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
-        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1779,6 +1892,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
     free(sched);
 }
 
@@ -1864,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
     return sched->n_copies;
 }
 
+int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    return sched->n_backends;
+}
+
+ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(i >= 0 && i < sched->n_backends);
+    return sched->backends[i];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
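A short usage sketch for the two scheduler getters added above; illustrative client code, not part of the gem, assuming a valid scheduler obtained from `ggml_backend_sched_new`:

```c
#include <stdio.h>
#include "ggml-backend.h"

// Enumerate the backends a scheduler was constructed with, in priority order.
static void print_sched_backends(ggml_backend_sched_t sched) {
    const int n = ggml_backend_sched_get_n_backends(sched);
    for (int i = 0; i < n; i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        printf("backend %d: %s\n", i, ggml_backend_name(backend));
    }
}
```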
@@ -1875,6 +1999,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
data/vendor/tmp/llama.cpp/ggml-backend.h:

@@ -23,7 +23,6 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
 
     // buffer
@@ -74,6 +73,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
@@ -90,7 +90,7 @@ extern "C" {
     GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
+    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
 
     //
     // CPU backend
@@ -119,7 +119,7 @@ extern "C" {
 
     GGML_API size_t ggml_backend_reg_get_count(void);
     GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char * ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
     GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
@@ -182,6 +182,9 @@ extern "C" {
     // Initialize backend buffers from a measure graph
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 
+    GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);