llama_cpp 0.16.0 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
|
@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
|
|
|
44
44
|
return ggml_nbytes(tensor);
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
-
bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
48
|
-
return buft->iface.supports_backend(buft, backend);
|
|
49
|
-
}
|
|
50
|
-
|
|
51
47
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
|
52
48
|
if (buft->iface.is_host) {
|
|
53
49
|
return buft->iface.is_host(buft);
|
|
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
|
|
|
286
282
|
return backend->iface.supports_op(backend, op);
|
|
287
283
|
}
|
|
288
284
|
|
|
285
|
+
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
286
|
+
return backend->iface.supports_buft(backend, buft);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
289
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
290
290
|
if (backend->iface.offload_op != NULL) {
|
|
291
291
|
return backend->iface.offload_op(backend, op);
|
|
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
|
|
|
639
639
|
GGML_UNUSED(buft);
|
|
640
640
|
}
|
|
641
641
|
|
|
642
|
-
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
643
|
-
return ggml_backend_is_cpu(backend);
|
|
644
|
-
|
|
645
|
-
GGML_UNUSED(buft);
|
|
646
|
-
}
|
|
647
|
-
|
|
648
642
|
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
649
643
|
return true;
|
|
650
644
|
|
|
@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
|
659
653
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
660
654
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
661
655
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
662
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
|
663
656
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
664
657
|
},
|
|
665
658
|
/* .context = */ NULL,
|
|
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
|
715
708
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
716
709
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
717
710
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
718
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
|
719
711
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
720
712
|
},
|
|
721
713
|
/* .context = */ NULL,
|
|
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
|
|
|
836
828
|
GGML_UNUSED(backend);
|
|
837
829
|
}
|
|
838
830
|
|
|
831
|
+
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
832
|
+
return ggml_backend_buft_is_host(buft);
|
|
833
|
+
|
|
834
|
+
GGML_UNUSED(backend);
|
|
835
|
+
}
|
|
836
|
+
|
|
839
837
|
static struct ggml_backend_i cpu_backend_i = {
|
|
840
838
|
/* .get_name = */ ggml_backend_cpu_name,
|
|
841
839
|
/* .free = */ ggml_backend_cpu_free,
|
|
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
|
846
844
|
/* .synchronize = */ NULL,
|
|
847
845
|
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
|
848
846
|
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
|
847
|
+
/* .graph_plan_update = */ NULL,
|
|
849
848
|
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
|
850
849
|
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
|
851
850
|
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
|
851
|
+
/* .supports_buft = */ ggml_backend_cpu_supports_buft,
|
|
852
852
|
/* .offload_op = */ NULL,
|
|
853
853
|
/* .event_new = */ NULL,
|
|
854
854
|
/* .event_free = */ NULL,
|
|
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
|
|
|
1055
1055
|
int * node_backend_ids; // [graph_size]
|
|
1056
1056
|
int * leaf_backend_ids; // [graph_size]
|
|
1057
1057
|
|
|
1058
|
+
int * prev_node_backend_ids; // [graph_size]
|
|
1059
|
+
int * prev_leaf_backend_ids; // [graph_size]
|
|
1060
|
+
|
|
1058
1061
|
// copy of the graph with modified inputs
|
|
1059
1062
|
struct ggml_cgraph * graph;
|
|
1060
1063
|
|
|
@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
|
|
|
1075
1078
|
ggml_backend_sched_eval_callback callback_eval;
|
|
1076
1079
|
void * callback_eval_user_data;
|
|
1077
1080
|
|
|
1081
|
+
bool debug;
|
|
1082
|
+
|
|
1078
1083
|
// align context_buffer to GGML_MEM_ALIGN
|
|
1079
1084
|
#ifdef _MSC_VER
|
|
1080
1085
|
__declspec(align(GGML_MEM_ALIGN))
|
|
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
|
|
|
1097
1102
|
return -1;
|
|
1098
1103
|
}
|
|
1099
1104
|
|
|
1100
|
-
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
|
|
1105
|
+
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
|
1101
1106
|
ggml_backend_buffer_t buffer = tensor->buffer;
|
|
1102
1107
|
if (buffer == NULL) {
|
|
1103
1108
|
return -1;
|
|
1104
1109
|
}
|
|
1105
1110
|
|
|
1106
|
-
// find highest prio backend that supports the buffer type
|
|
1111
|
+
// find highest prio backend that supports the buffer type and the op
|
|
1107
1112
|
for (int i = 0; i < sched->n_backends; i++) {
|
|
1108
|
-
if (
|
|
1113
|
+
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
|
|
1114
|
+
ggml_backend_supports_op(sched->backends[i], op)) {
|
|
1109
1115
|
return i;
|
|
1110
1116
|
}
|
|
1111
1117
|
}
|
|
1112
1118
|
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1119
|
+
#ifndef NDEBUG
|
|
1120
|
+
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
|
1121
|
+
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
|
1122
|
+
#endif
|
|
1116
1123
|
|
|
1117
1124
|
return -1;
|
|
1118
1125
|
}
|
|
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1131
1138
|
// TODO: use supports_op to check if the backend supports the op
|
|
1132
1139
|
|
|
1133
1140
|
// assign pre-allocated nodes to their backend
|
|
1134
|
-
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
|
|
1141
|
+
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
|
1135
1142
|
if (cur_backend_id != -1) {
|
|
1136
1143
|
SET_CAUSE(tensor, "1.dst");
|
|
1137
1144
|
return cur_backend_id;
|
|
@@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1139
1146
|
|
|
1140
1147
|
// view_src
|
|
1141
1148
|
if (tensor->view_src != NULL) {
|
|
1142
|
-
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
|
|
1149
|
+
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
|
|
1143
1150
|
if (cur_backend_id != -1) {
|
|
1144
1151
|
SET_CAUSE(tensor, "1.vsrc");
|
|
1145
1152
|
return cur_backend_id;
|
|
@@ -1161,11 +1168,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1161
1168
|
continue;
|
|
1162
1169
|
}
|
|
1163
1170
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
|
1164
|
-
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
|
|
1171
|
+
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
|
1165
1172
|
// check if a backend with higher prio wants to offload the op
|
|
1166
1173
|
if (src_backend_id == sched->n_backends - 1) {
|
|
1167
1174
|
for (int b = 0; b < src_backend_id; b++) {
|
|
1168
|
-
if (ggml_backend_offload_op(sched->backends[b], tensor)) {
|
|
1175
|
+
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
|
1169
1176
|
SET_CAUSE(tensor, "1.off");
|
|
1170
1177
|
return b;
|
|
1171
1178
|
}
|
|
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|
|
1223
1230
|
}
|
|
1224
1231
|
}
|
|
1225
1232
|
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1233
|
+
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
|
|
1234
|
+
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
|
1235
|
+
ggml_backend_buffer_type_t buft = NULL;
|
|
1236
|
+
|
|
1237
|
+
if (buf) {
|
|
1238
|
+
// the tensor is already allocated
|
|
1239
|
+
buft = buf->buft;
|
|
1240
|
+
} else {
|
|
1241
|
+
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
|
1242
|
+
int tensor_backend_id = tensor_backend_id(t);
|
|
1243
|
+
if (tensor_backend_id == -1 && t->view_src) {
|
|
1244
|
+
tensor_backend_id = tensor_backend_id(t->view_src);
|
|
1245
|
+
}
|
|
1246
|
+
if (tensor_backend_id != -1) {
|
|
1247
|
+
buft = sched->bufts[tensor_backend_id];
|
|
1248
|
+
}
|
|
1249
|
+
}
|
|
1250
|
+
|
|
1251
|
+
return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
|
|
1252
|
+
}
|
|
1253
|
+
|
|
1254
|
+
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
|
|
1255
|
+
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
|
1256
|
+
*node_backend_id = cur_backend_id;
|
|
1257
|
+
SET_CAUSE(node, "2.sup");
|
|
1258
|
+
}
|
|
1259
|
+
}
|
|
1230
1260
|
|
|
1231
1261
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
|
1232
1262
|
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1280
1310
|
}
|
|
1281
1311
|
}
|
|
1282
1312
|
}
|
|
1283
|
-
#ifdef DEBUG_PASS1
|
|
1284
|
-
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
1285
|
-
#endif
|
|
1286
1313
|
|
|
1287
1314
|
// pass 2: expand current backend assignments
|
|
1288
1315
|
// assign the same backend to adjacent nodes
|
|
1289
1316
|
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
|
1290
1317
|
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
// pass 2.2 expand gpu down
|
|
1318
|
+
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
|
|
1319
|
+
// expand gpu down
|
|
1294
1320
|
{
|
|
1295
1321
|
int cur_backend_id = -1;
|
|
1296
1322
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1306
1332
|
} else {
|
|
1307
1333
|
cur_backend_id = *node_backend_id;
|
|
1308
1334
|
}
|
|
1309
|
-
} else {
|
|
1310
|
-
|
|
1311
|
-
SET_CAUSE(node, "2.2");
|
|
1335
|
+
} else if (cur_backend_id != -1) {
|
|
1336
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1312
1337
|
}
|
|
1313
1338
|
}
|
|
1314
1339
|
}
|
|
1315
|
-
//
|
|
1340
|
+
// expand gpu up
|
|
1316
1341
|
{
|
|
1317
1342
|
int cur_backend_id = -1;
|
|
1318
1343
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1328
1353
|
} else {
|
|
1329
1354
|
cur_backend_id = *node_backend_id;
|
|
1330
1355
|
}
|
|
1331
|
-
} else {
|
|
1332
|
-
|
|
1333
|
-
SET_CAUSE(node, "2.1");
|
|
1356
|
+
} else if (cur_backend_id != -1) {
|
|
1357
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1334
1358
|
}
|
|
1335
1359
|
}
|
|
1336
1360
|
}
|
|
1337
|
-
//
|
|
1361
|
+
// expand rest down
|
|
1338
1362
|
{
|
|
1339
1363
|
int cur_backend_id = -1;
|
|
1340
1364
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1345
1369
|
int * node_backend_id = &tensor_backend_id(node);
|
|
1346
1370
|
if (*node_backend_id != -1) {
|
|
1347
1371
|
cur_backend_id = *node_backend_id;
|
|
1348
|
-
} else {
|
|
1349
|
-
|
|
1350
|
-
SET_CAUSE(node, "2.4");
|
|
1372
|
+
} else if (cur_backend_id != -1) {
|
|
1373
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1351
1374
|
}
|
|
1352
1375
|
}
|
|
1353
1376
|
}
|
|
1354
|
-
//
|
|
1377
|
+
// expand rest up
|
|
1355
1378
|
{
|
|
1356
1379
|
int cur_backend_id = -1;
|
|
1357
1380
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1362
1385
|
int * node_backend_id = &tensor_backend_id(node);
|
|
1363
1386
|
if (*node_backend_id != -1) {
|
|
1364
1387
|
cur_backend_id = *node_backend_id;
|
|
1365
|
-
} else {
|
|
1366
|
-
|
|
1367
|
-
SET_CAUSE(node, "2.3");
|
|
1388
|
+
} else if (cur_backend_id != -1) {
|
|
1389
|
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
1368
1390
|
}
|
|
1369
1391
|
}
|
|
1370
1392
|
}
|
|
1371
1393
|
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1394
|
+
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
|
1395
|
+
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
|
1396
|
+
// however, we also need to verify that the sources are in compatible buffer types
|
|
1397
|
+
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
|
1398
|
+
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
|
1399
|
+
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
|
1400
|
+
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
|
1401
|
+
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
|
1402
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1403
|
+
struct ggml_tensor * node = graph->nodes[i];
|
|
1404
|
+
if (ggml_is_view_op(node->op)) {
|
|
1405
|
+
continue;
|
|
1406
|
+
}
|
|
1407
|
+
int * node_backend_id = &tensor_backend_id(node);
|
|
1408
|
+
if (*node_backend_id == -1) {
|
|
1409
|
+
// unassigned node: find the backend with the most supported inputs
|
|
1410
|
+
int n_supported_best = -1;
|
|
1411
|
+
for (int b = 0; b < sched->n_backends; b++) {
|
|
1412
|
+
if (ggml_backend_supports_op(sched->backends[b], node)) {
|
|
1413
|
+
int n_supported = 0;
|
|
1414
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1415
|
+
struct ggml_tensor * src = node->src[j];
|
|
1416
|
+
if (src == NULL) {
|
|
1417
|
+
continue;
|
|
1418
|
+
}
|
|
1419
|
+
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
|
|
1420
|
+
n_supported++;
|
|
1421
|
+
}
|
|
1422
|
+
}
|
|
1423
|
+
if (n_supported > n_supported_best) {
|
|
1424
|
+
n_supported_best = n_supported;
|
|
1425
|
+
*node_backend_id = b;
|
|
1426
|
+
SET_CAUSE(node, "3.best");
|
|
1427
|
+
}
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
} else {
|
|
1431
|
+
// assigned node: upgrade to higher prio backend if possible
|
|
1432
|
+
for (int b = 0; b < *node_backend_id; b++) {
|
|
1433
|
+
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
|
|
1434
|
+
bool supported = true;
|
|
1435
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1436
|
+
struct ggml_tensor * src = node->src[j];
|
|
1437
|
+
if (src == NULL) {
|
|
1438
|
+
continue;
|
|
1439
|
+
}
|
|
1440
|
+
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
|
|
1441
|
+
supported = false;
|
|
1442
|
+
break;
|
|
1443
|
+
}
|
|
1444
|
+
}
|
|
1445
|
+
if (supported) {
|
|
1446
|
+
*node_backend_id = b;
|
|
1447
|
+
SET_CAUSE(node, "3.upg");
|
|
1448
|
+
break;
|
|
1449
|
+
}
|
|
1450
|
+
}
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
}
|
|
1375
1454
|
|
|
1376
|
-
// pass
|
|
1455
|
+
// pass 4: assign backends to remaining src from dst and view_src
|
|
1377
1456
|
for (int i = 0; i < graph->n_nodes; i++) {
|
|
1378
1457
|
struct ggml_tensor * node = graph->nodes[i];
|
|
1379
1458
|
int * cur_backend_id = &tensor_backend_id(node);
|
|
1380
1459
|
if (node->view_src != NULL && *cur_backend_id == -1) {
|
|
1381
1460
|
*cur_backend_id = tensor_backend_id(node->view_src);
|
|
1382
|
-
SET_CAUSE(node, "
|
|
1461
|
+
SET_CAUSE(node, "4.vsrc");
|
|
1383
1462
|
}
|
|
1384
1463
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
1385
1464
|
struct ggml_tensor * src = node->src[j];
|
|
@@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1391
1470
|
if (src->view_src != NULL) {
|
|
1392
1471
|
// views are always on the same backend as the source
|
|
1393
1472
|
*src_backend_id = tensor_backend_id(src->view_src);
|
|
1394
|
-
SET_CAUSE(src, "
|
|
1473
|
+
SET_CAUSE(src, "4.vsrc");
|
|
1395
1474
|
} else {
|
|
1396
1475
|
*src_backend_id = *cur_backend_id;
|
|
1397
|
-
SET_CAUSE(src, "
|
|
1476
|
+
SET_CAUSE(src, "4.cur");
|
|
1398
1477
|
}
|
|
1399
1478
|
}
|
|
1400
1479
|
}
|
|
1401
1480
|
}
|
|
1402
|
-
#ifdef DEBUG_PASS3
|
|
1403
|
-
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
1404
|
-
#endif
|
|
1405
1481
|
|
|
1406
1482
|
// pass 4: split graph, find tensors that need to be copied
|
|
1407
1483
|
{
|
|
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1448
1524
|
}
|
|
1449
1525
|
}
|
|
1450
1526
|
// check if the split has too many inputs
|
|
1527
|
+
// FIXME: count the number of inputs instead of only checking when full
|
|
1451
1528
|
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
|
1452
1529
|
const size_t id = hash_id(src);
|
|
1453
1530
|
int src_backend_id = sched->tensor_backend_id[id];
|
|
1454
|
-
|
|
1531
|
+
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
|
1532
|
+
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
|
|
1455
1533
|
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
|
1456
1534
|
need_new_split = true;
|
|
1457
1535
|
break;
|
|
@@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1486
1564
|
const int src_backend_id = tensor_backend_id(src);
|
|
1487
1565
|
assert(src_backend_id != -1); // all inputs should be assigned by now
|
|
1488
1566
|
|
|
1489
|
-
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1)
|
|
1567
|
+
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
|
1490
1568
|
size_t id = hash_id(src);
|
|
1491
1569
|
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
|
1492
1570
|
ggml_backend_t backend = sched->backends[src_backend_id];
|
|
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1511
1589
|
}
|
|
1512
1590
|
}
|
|
1513
1591
|
|
|
1514
|
-
|
|
1592
|
+
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
|
1593
|
+
if (src_backend_id != cur_backend_id && !supported) {
|
|
1515
1594
|
// create a copy of the input in the split's backend
|
|
1516
1595
|
const size_t id = hash_id(src);
|
|
1517
1596
|
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
|
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1537
1616
|
split->i_end = graph->n_nodes;
|
|
1538
1617
|
sched->n_splits = i_split + 1;
|
|
1539
1618
|
}
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1619
|
+
|
|
1620
|
+
if (sched->debug) {
|
|
1621
|
+
ggml_backend_sched_print_assignments(sched, graph);
|
|
1622
|
+
}
|
|
1623
|
+
|
|
1624
|
+
// swap node_backend_ids and leaf_backend_ids and prevs
|
|
1625
|
+
{
|
|
1626
|
+
int * tmp = sched->node_backend_ids;
|
|
1627
|
+
sched->node_backend_ids = sched->prev_node_backend_ids;
|
|
1628
|
+
sched->prev_node_backend_ids = tmp;
|
|
1629
|
+
|
|
1630
|
+
tmp = sched->leaf_backend_ids;
|
|
1631
|
+
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
|
1632
|
+
sched->prev_leaf_backend_ids = tmp;
|
|
1633
|
+
}
|
|
1543
1634
|
|
|
1544
1635
|
// create copies of the graph for each split
|
|
1545
1636
|
// TODO: avoid this copy
|
|
@@ -1613,8 +1704,26 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
|
1613
1704
|
}
|
|
1614
1705
|
|
|
1615
1706
|
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
1707
|
+
bool backend_ids_changed = false;
|
|
1708
|
+
for (int i = 0; i < sched->graph->n_nodes; i++) {
|
|
1709
|
+
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
|
1710
|
+
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
|
1711
|
+
backend_ids_changed = true;
|
|
1712
|
+
break;
|
|
1713
|
+
}
|
|
1714
|
+
}
|
|
1715
|
+
if (!backend_ids_changed) {
|
|
1716
|
+
for (int i = 0; i < sched->graph->n_leafs; i++) {
|
|
1717
|
+
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
|
1718
|
+
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
|
1719
|
+
backend_ids_changed = true;
|
|
1720
|
+
break;
|
|
1721
|
+
}
|
|
1722
|
+
}
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1616
1725
|
// allocate graph
|
|
1617
|
-
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
|
1726
|
+
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
|
1618
1727
|
// the re-allocation may cause the split inputs to be moved to a different address
|
|
1619
1728
|
ggml_backend_sched_synchronize(sched);
|
|
1620
1729
|
#ifndef NDEBUG
|
|
@@ -1727,6 +1836,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
1727
1836
|
|
|
1728
1837
|
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
|
1729
1838
|
|
|
1839
|
+
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
|
|
1840
|
+
|
|
1730
1841
|
// initialize hash table
|
|
1731
1842
|
sched->hash_set = ggml_hash_set_new(graph_size);
|
|
1732
1843
|
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
|
@@ -1735,6 +1846,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
1735
1846
|
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
|
1736
1847
|
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
|
1737
1848
|
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
|
1849
|
+
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
|
1850
|
+
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
|
1738
1851
|
|
|
1739
1852
|
sched->n_backends = n_backends;
|
|
1740
1853
|
|
|
@@ -1747,7 +1860,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
|
1747
1860
|
for (int b = 0; b < n_backends; b++) {
|
|
1748
1861
|
sched->backends[b] = backends[b];
|
|
1749
1862
|
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
|
1750
|
-
GGML_ASSERT(
|
|
1863
|
+
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
|
1751
1864
|
if (sched->n_copies > 1) {
|
|
1752
1865
|
for (int c = 0; c < sched->n_copies; c++) {
|
|
1753
1866
|
sched->events[b][c] = ggml_backend_event_new(backends[b]);
|
|
@@ -1779,6 +1892,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
|
1779
1892
|
free(sched->tensor_copies);
|
|
1780
1893
|
free(sched->node_backend_ids);
|
|
1781
1894
|
free(sched->leaf_backend_ids);
|
|
1895
|
+
free(sched->prev_node_backend_ids);
|
|
1896
|
+
free(sched->prev_leaf_backend_ids);
|
|
1782
1897
|
free(sched);
|
|
1783
1898
|
}
|
|
1784
1899
|
|
|
@@ -1864,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
|
|
1864
1979
|
return sched->n_copies;
|
|
1865
1980
|
}
|
|
1866
1981
|
|
|
1982
|
+
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
|
|
1983
|
+
return sched->n_backends;
|
|
1984
|
+
}
|
|
1985
|
+
|
|
1986
|
+
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
|
|
1987
|
+
GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
|
1988
|
+
return sched->backends[i];
|
|
1989
|
+
}
|
|
1990
|
+
|
|
1867
1991
|
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
|
1868
1992
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
1869
1993
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
@@ -1875,6 +1999,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
|
|
|
1875
1999
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
|
1876
2000
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
1877
2001
|
tensor_backend_id(node) = backend_index;
|
|
2002
|
+
SET_CAUSE(node, "usr");
|
|
1878
2003
|
}
|
|
1879
2004
|
|
|
1880
2005
|
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
|
@@ -23,7 +23,6 @@ extern "C" {
|
|
|
23
23
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
|
24
24
|
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
|
25
25
|
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
|
26
|
-
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
|
27
26
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
|
28
27
|
|
|
29
28
|
// buffer
|
|
@@ -74,6 +73,7 @@ extern "C" {
|
|
|
74
73
|
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
75
74
|
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
76
75
|
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
76
|
+
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
|
77
77
|
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
78
78
|
|
|
79
79
|
// tensor copy between different backends
|
|
@@ -90,7 +90,7 @@ extern "C" {
|
|
|
90
90
|
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
|
91
91
|
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
|
|
92
92
|
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
|
|
93
|
-
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
|
|
93
|
+
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
|
|
94
94
|
|
|
95
95
|
//
|
|
96
96
|
// CPU backend
|
|
@@ -119,7 +119,7 @@ extern "C" {
|
|
|
119
119
|
|
|
120
120
|
GGML_API size_t ggml_backend_reg_get_count(void);
|
|
121
121
|
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
|
|
122
|
-
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is
|
|
122
|
+
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
|
|
123
123
|
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
|
124
124
|
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
|
125
125
|
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
|
|
@@ -182,6 +182,9 @@ extern "C" {
|
|
|
182
182
|
// Initialize backend buffers from a measure graph
|
|
183
183
|
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
|
184
184
|
|
|
185
|
+
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
|
|
186
|
+
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
|
|
187
|
+
|
|
185
188
|
// Get the number of splits of the last graph
|
|
186
189
|
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
|
187
190
|
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|