llama_cpp 0.16.0 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/ext/llama_cpp/extconf.rb +3 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +14 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +4 -0
  7. data/vendor/tmp/llama.cpp/Makefile +119 -54
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
  126. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  127. data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
  128. data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
  129. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
  130. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
  131. data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
  132. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
  133. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
  134. data/vendor/tmp/llama.cpp/ggml.c +158 -414
  135. data/vendor/tmp/llama.cpp/ggml.h +6 -0
  136. data/vendor/tmp/llama.cpp/llama.cpp +628 -279
  137. data/vendor/tmp/llama.cpp/llama.h +9 -1
  138. data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
  139. data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
  140. data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
  141. data/vendor/tmp/llama.cpp/unicode.h +1 -1
  142. metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-backend.c
@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
     return ggml_nbytes(tensor);
 }
 
-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }
 
+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return backend->iface.supports_buft(backend, buft);
+}
+
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);
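The new ggml_backend_supports_buft reverses the old query: compatibility is now decided by the backend rather than by the buffer type. A minimal sketch, assuming the caller keeps its backends ordered by priority, of how this pairs with ggml_backend_supports_op to pick a backend for an op (pick_backend is a hypothetical helper, not part of this diff):

    // hypothetical helper: highest-priority backend that can both run the op
    // and read weights from the given buffer type
    static int pick_backend(ggml_backend_t * backends, int n_backends,
                            ggml_backend_buffer_type_t buft, const struct ggml_tensor * op) {
        for (int i = 0; i < n_backends; i++) {
            if (ggml_backend_supports_buft(backends[i], buft) &&
                ggml_backend_supports_op(backends[i], op)) {
                return i;
            }
        }
        return -1; // no match: the scheduler falls back to copying the tensor
    }

This is the same check that ggml_backend_sched_backend_from_buffer performs in the scheduler hunks further down.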
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
     GGML_UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(backend);
+}
+
 static struct ggml_backend_i cpu_backend_i = {
     /* .get_name                = */ ggml_backend_cpu_name,
     /* .free                    = */ ggml_backend_cpu_free,
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .supports_buft           = */ ggml_backend_cpu_supports_buft,
     /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
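With supports_backend gone from the buffer-type interface, every backend now fills in a supports_buft callback in its ggml_backend_i table, as the CPU backend does above. A minimal sketch of the callback for a hypothetical out-of-tree backend (my_backend_buffer_type is an assumed name, not a real API), following the same GGML_UNUSED idiom as the CPU version:

    GGML_CALL static bool my_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
        // accept the backend's own buffer type, or any host-resident buffer
        // that the backend can read directly
        return buft == my_backend_buffer_type() || ggml_backend_buft_is_host(buft);

        GGML_UNUSED(backend);
    }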
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
 
+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
 
@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }
 
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
 
-    // find highest prio backend that supports the buffer type
+    // find highest prio backend that supports the buffer type and the op
     for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
             return i;
         }
     }
 
-    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
-            __func__, ggml_backend_buffer_name(buffer), tensor->name);
-    GGML_ASSERT(false);
+#ifndef NDEBUG
+    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+            __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif
 
     return -1;
 }
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
 
     // assign pre-allocated nodes to their backend
-    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
         return cur_backend_id;
@@ -1139,7 +1146,7 @@
 
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
         if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
             return cur_backend_id;
@@ -1161,11 +1168,11 @@
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
-                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
                         return b;
                     }
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }
 
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id == -1 && t->view_src) {
+            tensor_backend_id = tensor_backend_id(t->view_src);
+        }
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+}
+
+static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.sup");
+    }
+}
 
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-
-    // pass 2.2 expand gpu down
+    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
+    // expand gpu down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.1 expand gpu up
+    // expand gpu up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.4 expand rest down
+    // expand rest down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.3 expand rest up
+    // expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
 
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+    // pass 3: upgrade nodes to higher prio backends with compatible buffer types
+    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+    // however, we also need to verify that the sources are in compatible buffer types
+    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
+    // additionally, set remaining unassigned nodes to the backend with the most supported inputs
+    // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id == -1) {
+            // unassigned node: find the backend with the most supported inputs
+            int n_supported_best = -1;
+            for (int b = 0; b < sched->n_backends; b++) {
+                if (ggml_backend_supports_op(sched->backends[b], node)) {
+                    int n_supported = 0;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            n_supported++;
+                        }
+                    }
+                    if (n_supported > n_supported_best) {
+                        n_supported_best = n_supported;
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.best");
+                    }
+                }
+            }
+        } else {
+            // assigned node: upgrade to higher prio backend if possible
+            for (int b = 0; b < *node_backend_id; b++) {
+                if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+                    bool supported = true;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            supported = false;
+                            break;
+                        }
+                    }
+                    if (supported) {
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.upg");
+                        break;
+                    }
+                }
+            }
+        }
+    }
 
-    // pass 3: assign backends to remaining src from dst and view_src
+    // pass 4: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * cur_backend_id = &tensor_backend_id(node);
         if (node->view_src != NULL && *cur_backend_id == -1) {
             *cur_backend_id = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
+            SET_CAUSE(node, "4.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
                     *src_backend_id = tensor_backend_id(src->view_src);
-                    SET_CAUSE(src, "3.vsrc");
+                    SET_CAUSE(src, "4.vsrc");
                 } else {
                     *src_backend_id = *cur_backend_id;
-                    SET_CAUSE(src, "3.cur");
+                    SET_CAUSE(src, "4.cur");
                 }
             }
         }
     }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
             // check if the split has too many inputs
+            // FIXME: count the number of inputs instead of only checking when full
             if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                 const size_t id = hash_id(src);
                 int src_backend_id = sched->tensor_backend_id[id];
-                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                     //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;
@@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             const int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
 
-            if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
+            if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                 size_t id = hash_id(src);
                 if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                     ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
 
-            if (src_backend_id != node_backend_id) {
+            bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+            if (src_backend_id != cur_backend_id && !supported) {
                 // create a copy of the input in the split's backend
                 const size_t id = hash_id(src);
                 if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
+
+    // swap node_backend_ids and leaf_backend_ids and prevs
+    {
+        int * tmp = sched->node_backend_ids;
+        sched->node_backend_ids = sched->prev_node_backend_ids;
+        sched->prev_node_backend_ids = tmp;
+
+        tmp = sched->leaf_backend_ids;
+        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+        sched->prev_leaf_backend_ids = tmp;
+    }
 
     // create copies of the graph for each split
     // TODO: avoid this copy
@@ -1613,8 +1704,26 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph->n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph->n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
     // allocate graph
-    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
@@ -1727,6 +1836,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
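This replaces the compile-time DEBUG_PASS* switches removed earlier: when the GGML_SCHED_DEBUG environment variable is set, the scheduler prints its assignments after splitting the graph. A usage sketch, assuming a POSIX environment (application code, not part of the diff):

    #include <stdlib.h>

    // enable scheduler debug output before the scheduler is created
    setenv("GGML_SCHED_DEBUG", "1", 1); // on Windows, _putenv_s("GGML_SCHED_DEBUG", "1") instead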
@@ -1735,6 +1846,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
@@ -1747,7 +1860,7 @@
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
-        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1779,6 +1892,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
     free(sched);
 }
 
@@ -1864,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
     return sched->n_copies;
 }
 
+int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    return sched->n_backends;
+}
+
+ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(i >= 0 && i < sched->n_backends);
+    return sched->backends[i];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
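The two new accessors expose the scheduler's backend list to callers. A short usage sketch (assumed application code, with sched an existing ggml_backend_sched_t) that enumerates them:

    #include <stdio.h>

    // list the backends registered with a scheduler, in priority order
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        fprintf(stderr, "backend %d: %s\n", i, ggml_backend_name(backend));
    }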
@@ -1875,6 +1999,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
data/vendor/tmp/llama.cpp/ggml-backend.h
@@ -23,7 +23,6 @@ extern "C" {
     GGML_API size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
     GGML_API size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t      ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
 
     // buffer
@@ -74,6 +73,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
@@ -90,7 +90,7 @@ extern "C" {
     GGML_API void ggml_backend_event_free        (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_record      (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_synchronize (ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait        (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
+    GGML_API void ggml_backend_event_wait        (ggml_backend_t backend, ggml_backend_event_t event);
 
     //
     // CPU backend
@@ -119,7 +119,7 @@ extern "C" {
 
     GGML_API size_t                ggml_backend_reg_get_count(void);
     GGML_API size_t                ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t        ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API ggml_backend_t        ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char *          ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t        ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
     GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
@@ -182,6 +182,9 @@ extern "C" {
     // Initialize backend buffers from a measure graph
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 
+    GGML_API int                   ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t        ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);