llama_cpp 0.16.0 → 0.16.1

Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-backend.c

@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
     return ggml_nbytes(tensor);
 }

-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return backend->iface.supports_buft(backend, buft);
+}
+
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
     GGML_UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;

@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(backend);
+}
+
 static struct ggml_backend_i cpu_backend_i = {
     /* .get_name = */ ggml_backend_cpu_name,
     /* .free = */ ggml_backend_cpu_free,
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .supports_buft = */ ggml_backend_cpu_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]

+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;

@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;

+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }

-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }

-    // find highest prio backend that supports the buffer type
+    // find highest prio backend that supports the buffer type and the op
     for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
             return i;
         }
     }

-    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
-            __func__, ggml_backend_buffer_name(buffer), tensor->name);
-    GGML_ASSERT(false);
+#ifndef NDEBUG
+    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+            __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif

     return -1;
 }
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op

     // assign pre-allocated nodes to their backend
-    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
         return cur_backend_id;
@@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
         if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
             return cur_backend_id;
@@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }

-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id == -1 && t->view_src) {
+            tensor_backend_id = tensor_backend_id(t->view_src);
+        }
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+}
+
+static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.sup");
+    }
+}

 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif

     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-
-    // pass 2.2 expand gpu down
+    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
+    // expand gpu down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.1 expand gpu up
+    // expand gpu up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.4 expand rest down
+    // expand rest down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-    // pass 2.3 expand rest up
+    // expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }

-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+    // pass 3: upgrade nodes to higher prio backends with compatible buffer types
+    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+    // however, we also need to verify that the sources are in compatible buffer types
+    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
+    // additionally, set remaining unassigned nodes to the backend with the most supported inputs
+    // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id == -1) {
+            // unassigned node: find the backend with the most supported inputs
+            int n_supported_best = -1;
+            for (int b = 0; b < sched->n_backends; b++) {
+                if (ggml_backend_supports_op(sched->backends[b], node)) {
+                    int n_supported = 0;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            n_supported++;
+                        }
+                    }
+                    if (n_supported > n_supported_best) {
+                        n_supported_best = n_supported;
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.best");
+                    }
+                }
+            }
+        } else {
+            // assigned node: upgrade to higher prio backend if possible
+            for (int b = 0; b < *node_backend_id; b++) {
+                if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+                    bool supported = true;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            supported = false;
+                            break;
+                        }
+                    }
+                    if (supported) {
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.upg");
+                        break;
+                    }
+                }
+            }
+        }
+    }

-    // pass 3: assign backends to remaining src from dst and view_src
+    // pass 4: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * cur_backend_id = &tensor_backend_id(node);
         if (node->view_src != NULL && *cur_backend_id == -1) {
             *cur_backend_id = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
+            SET_CAUSE(node, "4.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
                     *src_backend_id = tensor_backend_id(src->view_src);
-                    SET_CAUSE(src, "3.vsrc");
+                    SET_CAUSE(src, "4.vsrc");
                 } else {
                     *src_backend_id = *cur_backend_id;
-                    SET_CAUSE(src, "3.cur");
+                    SET_CAUSE(src, "4.cur");
                 }
             }
         }
     }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif

     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
             // check if the split has too many inputs
+            // FIXME: count the number of inputs instead of only checking when full
             if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                 const size_t id = hash_id(src);
                 int src_backend_id = sched->tensor_backend_id[id];
-                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                     //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;
@@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 const int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now

-                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     size_t id = hash_id(src);
                     if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }

-                if (src_backend_id != node_backend_id) {
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+                if (src_backend_id != cur_backend_id && !supported) {
                     // create a copy of the input in the split's backend
                     const size_t id = hash_id(src);
                     if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
+
+    // swap node_backend_ids and leaf_backend_ids and prevs
+    {
+        int * tmp = sched->node_backend_ids;
+        sched->node_backend_ids = sched->prev_node_backend_ids;
+        sched->prev_node_backend_ids = tmp;
+
+        tmp = sched->leaf_backend_ids;
+        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+        sched->prev_leaf_backend_ids = tmp;
+    }

     // create copies of the graph for each split
     // TODO: avoid this copy
@@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph->n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph->n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
     // allocate graph
-    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
@@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new(

     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));

+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
@@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

     sched->n_backends = n_backends;

@@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
-        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
     free(sched);
 }

@@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
 }

 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
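The net effect of the ggml-backend.c changes above: compatibility between a backend and a buffer type is now reported by the backend (the new supports_buft callback, exposed as ggml_backend_supports_buft) rather than by the buffer type (the removed supports_backend), the compile-time DEBUG_PASS* dumps are replaced by a runtime GGML_SCHED_DEBUG check, and the scheduler keeps prev_node_backend_ids/prev_leaf_backend_ids so it can skip reallocation when assignments did not change. A minimal sketch of the new query direction, mirroring the loop in ggml_backend_sched_backend_from_buffer; the helper name first_backend_supporting_buft is illustrative, not part of llama.cpp:

// Sketch only: pick the highest-priority backend that reports support for a buffer type
// via the backend-side check added in this release. The helper name is hypothetical;
// ggml_backend_supports_buft comes from the vendored ggml-backend.h shown below.
#include "ggml-backend.h"

static int first_backend_supporting_buft(ggml_backend_t * backends, int n_backends,
                                         ggml_backend_buffer_type_t buft) {
    for (int i = 0; i < n_backends; i++) {   // backends assumed ordered by priority
        if (ggml_backend_supports_buft(backends[i], buft)) {
            return i;
        }
    }
    return -1;                               // no compatible backend found
}

The scheduler version additionally requires ggml_backend_supports_op for the node and, when no backend qualifies, only warns under #ifndef NDEBUG and lets the weight be copied instead of asserting.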
data/vendor/tmp/llama.cpp/ggml-backend.h

@@ -23,7 +23,6 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

     // buffer
@@ -74,6 +73,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends
@@ -90,7 +90,7 @@ extern "C" {
     GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
+    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);

     //
     // CPU backend
@@ -119,7 +119,7 @@ extern "C" {

     GGML_API size_t ggml_backend_reg_get_count(void);
     GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char * ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
     GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
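For code built against these vendored headers, the practical migration is a swapped argument order: the removed ggml_backend_buft_supports_backend(buft, backend) becomes ggml_backend_supports_buft(backend, buft), as in the GGML_ASSERT change inside ggml_backend_sched_new above. A minimal before/after sketch, assuming only the declarations in this header (the wrapper name is illustrative):

#include "ggml.h"           // GGML_ASSERT
#include "ggml-backend.h"   // ggml_backend_supports_buft

// 0.16.0 vendored llama.cpp:
//     GGML_ASSERT(ggml_backend_buft_supports_backend(buft, backend));
// 0.16.1 vendored llama.cpp (note the swapped argument order):
static void assert_backend_buft_compatible(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(ggml_backend_supports_buft(backend, buft));
}

Scheduler debug output follows the same move from compile time to run time: setting the GGML_SCHED_DEBUG environment variable makes ggml_backend_sched print its backend assignments, replacing the old DEBUG_PASS* macros.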