llama_cpp 0.16.2 → 0.17.0

Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
data/vendor/tmp/llama.cpp/ggml-backend.c (deleted)
@@ -1,2225 +0,0 @@
1
- #include "ggml-backend-impl.h"
2
- #include "ggml-alloc.h"
3
- #include "ggml-impl.h"
4
-
5
- #include <assert.h>
6
- #include <limits.h>
7
- #include <stdarg.h>
8
- #include <stdio.h>
9
- #include <stdlib.h>
10
- #include <string.h>
11
-
12
-
13
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
-
15
- // backend buffer type
16
-
17
- const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
18
- return buft->iface.get_name(buft);
19
- }
20
-
21
- GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
22
- return buft->iface.alloc_buffer(buft, size);
23
- }
24
-
25
- size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
26
- return buft->iface.get_alignment(buft);
27
- }
28
-
29
- size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
30
- // get_max_size is optional, defaults to SIZE_MAX
31
- if (buft->iface.get_max_size) {
32
- return buft->iface.get_max_size(buft);
33
- }
34
- return SIZE_MAX;
35
- }
36
-
37
- GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
38
- // get_alloc_size is optional, defaults to ggml_nbytes
39
- if (buft->iface.get_alloc_size) {
40
- size_t size = buft->iface.get_alloc_size(buft, tensor);
41
- assert(size >= ggml_nbytes(tensor));
42
- return size;
43
- }
44
- return ggml_nbytes(tensor);
45
- }
46
-
47
- bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
48
- if (buft->iface.is_host) {
49
- return buft->iface.is_host(buft);
50
- }
51
- return false;
52
- }
53
-
54
- // backend buffer
55
-
56
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
57
- ggml_backend_buffer_type_t buft,
58
- struct ggml_backend_buffer_i iface,
59
- ggml_backend_buffer_context_t context,
60
- size_t size) {
61
- ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
62
-
63
- (*buffer) = (struct ggml_backend_buffer) {
64
- /* .interface = */ iface,
65
- /* .buft = */ buft,
66
- /* .context = */ context,
67
- /* .size = */ size,
68
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
69
- };
70
-
71
- return buffer;
72
- }
73
-
74
- const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
75
- return buffer->iface.get_name(buffer);
76
- }
77
-
78
- void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
79
- if (buffer == NULL) {
80
- return;
81
- }
82
-
83
- if (buffer->iface.free_buffer != NULL) {
84
- buffer->iface.free_buffer(buffer);
85
- }
86
- free(buffer);
87
- }
88
-
89
- size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
90
- return buffer->size;
91
- }
92
-
93
- void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
94
- void * base = buffer->iface.get_base(buffer);
95
-
96
- GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
97
-
98
- return base;
99
- }
100
-
101
- GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
102
- // init_tensor is optional
103
- if (buffer->iface.init_tensor) {
104
- buffer->iface.init_tensor(buffer, tensor);
105
- }
106
- }
107
-
108
- size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
109
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
110
- }
111
-
112
- size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
113
- return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
114
- }
115
-
116
- size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
117
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
118
- }
119
-
120
- void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
121
- buffer->iface.clear(buffer, value);
122
- }
123
-
124
- bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
125
- return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
126
- }
127
-
128
- void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
129
- buffer->usage = usage;
130
-
131
- // FIXME: add a generic callback to the buffer interface
132
- if (ggml_backend_buffer_is_multi_buffer(buffer)) {
133
- ggml_backend_multi_buffer_set_usage(buffer, usage);
134
- }
135
- }
136
-
137
- ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
138
- return buffer->buft;
139
- }
140
-
141
- void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
142
- if (buffer->iface.reset) {
143
- buffer->iface.reset(buffer);
144
- }
145
- }
146
-
147
- bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
148
- ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
149
- if (dst_buf->iface.cpy_tensor) {
150
- return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
151
- }
152
- return false;
153
- }
154
-
155
- // backend
156
-
157
- ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
158
- if (backend == NULL) {
159
- return NULL;
160
- }
161
- return backend->guid;
162
- }
163
-
164
- const char * ggml_backend_name(ggml_backend_t backend) {
165
- if (backend == NULL) {
166
- return "NULL";
167
- }
168
- return backend->iface.get_name(backend);
169
- }
170
-
171
- void ggml_backend_free(ggml_backend_t backend) {
172
- if (backend == NULL) {
173
- return;
174
- }
175
-
176
- backend->iface.free(backend);
177
- }
178
-
179
- ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
180
- return backend->iface.get_default_buffer_type(backend);
181
- }
182
-
183
- ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
184
- return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
185
- }
186
-
187
- size_t ggml_backend_get_alignment(ggml_backend_t backend) {
188
- return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
189
- }
190
-
191
- size_t ggml_backend_get_max_size(ggml_backend_t backend) {
192
- return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
193
- }
194
-
195
- void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
196
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
197
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
198
-
199
- if (backend->iface.set_tensor_async == NULL) {
200
- ggml_backend_tensor_set(tensor, data, offset, size);
201
- } else {
202
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
203
- }
204
- }
205
-
206
- void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
207
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
208
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
209
-
210
- if (backend->iface.get_tensor_async == NULL) {
211
- ggml_backend_tensor_get(tensor, data, offset, size);
212
- } else {
213
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
214
- }
215
- }
216
-
217
- GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
218
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
219
-
220
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
221
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
222
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
223
-
224
- if (!size) {
225
- return;
226
- }
227
-
228
- buf->iface.set_tensor(buf, tensor, data, offset, size);
229
- }
230
-
231
- GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
232
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
233
-
234
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
235
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
236
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
237
-
238
- if (!size) {
239
- return;
240
- }
241
-
242
- buf->iface.get_tensor(buf, tensor, data, offset, size);
243
- }
244
-
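// [editor's note] minimal usage sketch, not part of the removed file: the
// backend-agnostic tensor I/O helpers defined above move data between host
// memory and whatever buffer a tensor lives in. Assumes `t` was already
// allocated in a backend buffer (e.g. via ggml-alloc) and that the headers
// from this vendored tree are on the include path; illustrative only.
#include <stddef.h>
#include "ggml.h"
#include "ggml-backend.h"

static void round_trip_tensor_data(struct ggml_tensor * t, const float * src, float * dst) {
    const size_t nbytes = ggml_nbytes(t);
    // upload host data into the tensor's backend buffer
    // (degrades to a plain memcpy when the buffer is host/CPU memory)
    ggml_backend_tensor_set(t, src, 0, nbytes);
    // read it back into caller-provided host storage
    ggml_backend_tensor_get(t, dst, 0, nbytes);
}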
245
- void ggml_backend_synchronize(ggml_backend_t backend) {
246
- if (backend->iface.synchronize == NULL) {
247
- return;
248
- }
249
-
250
- backend->iface.synchronize(backend);
251
- }
252
-
253
- ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
254
- GGML_ASSERT(backend->iface.graph_plan_create != NULL);
255
-
256
- return backend->iface.graph_plan_create(backend, cgraph);
257
- }
258
-
259
- void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
260
- GGML_ASSERT(backend->iface.graph_plan_free != NULL);
261
-
262
- backend->iface.graph_plan_free(backend, plan);
263
- }
264
-
265
- enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
266
- GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
267
-
268
- return backend->iface.graph_plan_compute(backend, plan);
269
- }
270
-
271
- enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
272
- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
273
- ggml_backend_synchronize(backend);
274
- return err;
275
- }
276
-
277
- enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
278
- return backend->iface.graph_compute(backend, cgraph);
279
- }
280
-
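// [editor's note] sketch, not part of the removed file: the two ways this API
// runs a graph -- one-shot compute versus an explicit, reusable plan. The caller
// is assumed to have built and allocated `graph` elsewhere (e.g. with ggml-alloc);
// only functions defined above are used.
#include "ggml-backend.h"

enum ggml_status run_graph_both_ways(ggml_backend_t backend, struct ggml_cgraph * graph) {
    // one-shot: dispatch and wait for completion
    enum ggml_status status = ggml_backend_graph_compute(backend, graph);
    if (status != GGML_STATUS_SUCCESS) {
        return status;
    }
    // plan path: pay the planning cost once, then re-run the same graph cheaply
    ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, graph);
    if (plan == NULL) {
        return GGML_STATUS_ALLOC_FAILED;
    }
    status = ggml_backend_graph_plan_compute(backend, plan);
    ggml_backend_graph_plan_free(backend, plan);
    return status;
}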
281
- bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
282
- return backend->iface.supports_op(backend, op);
283
- }
284
-
285
- bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
286
- return backend->iface.supports_buft(backend, buft);
287
- }
288
-
289
- bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
290
- if (backend->iface.offload_op != NULL) {
291
- return backend->iface.offload_op(backend, op);
292
- }
293
- return false;
294
- }
295
-
296
- // backend copy
297
-
298
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
299
- if (a->type != b->type) {
300
- return false;
301
- }
302
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
303
- if (a->ne[i] != b->ne[i]) {
304
- return false;
305
- }
306
- if (a->nb[i] != b->nb[i]) {
307
- return false;
308
- }
309
- }
310
- return true;
311
- }
312
-
313
- void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
314
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
315
-
316
- if (src == dst) {
317
- return;
318
- }
319
-
320
- if (ggml_backend_buffer_is_host(src->buffer)) {
321
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
322
- } else if (ggml_backend_buffer_is_host(dst->buffer)) {
323
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
324
- } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
325
- #ifndef NDEBUG
326
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
327
- #endif
328
- size_t nbytes = ggml_nbytes(src);
329
- void * data = malloc(nbytes);
330
- ggml_backend_tensor_get(src, data, 0, nbytes);
331
- ggml_backend_tensor_set(dst, data, 0, nbytes);
332
- free(data);
333
- }
334
- }
335
-
336
- void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
337
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
338
-
339
- if (src == dst) {
340
- return;
341
- }
342
-
343
- if (backend_dst->iface.cpy_tensor_async != NULL) {
344
- if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
345
- return;
346
- }
347
- }
348
-
349
- // an async copy would normally happen after all the queued operations on both backends are completed
350
- // sync src, set_async dst
351
- if (ggml_backend_buffer_is_host(src->buffer)) {
352
- ggml_backend_synchronize(backend_src);
353
- ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
354
- } else {
355
- ggml_backend_synchronize(backend_src);
356
- ggml_backend_tensor_copy(src, dst);
357
- ggml_backend_synchronize(backend_dst);
358
- }
359
- }
360
-
361
- // events
362
-
363
- ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
364
- if (backend->iface.event_new == NULL) {
365
- return NULL;
366
- }
367
- return backend->iface.event_new(backend);
368
- }
369
-
370
- void ggml_backend_event_free(ggml_backend_event_t event) {
371
- if (event == NULL) {
372
- return;
373
- }
374
- event->backend->iface.event_free(event);
375
- }
376
-
377
- void ggml_backend_event_record(ggml_backend_event_t event) {
378
- GGML_ASSERT(event->backend->iface.event_record != NULL);
379
-
380
- event->backend->iface.event_record(event);
381
- }
382
-
383
- void ggml_backend_event_synchronize(ggml_backend_event_t event) {
384
- GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
385
-
386
- event->backend->iface.event_synchronize(event);
387
- }
388
-
389
- void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
390
- GGML_ASSERT(backend->iface.event_wait != NULL);
391
-
392
- backend->iface.event_wait(backend, event);
393
- }
394
-
395
- // backend registry
396
-
397
- #define GGML_REG_MAX_BACKENDS 16
398
-
399
- struct ggml_backend_reg {
400
- char name[128];
401
- ggml_backend_init_fn init_fn;
402
- ggml_backend_buffer_type_t default_buffer_type;
403
- void * user_data;
404
- };
405
-
406
- static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
407
- static size_t ggml_backend_registry_count = 0;
408
-
409
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
410
-
411
- GGML_CALL static void ggml_backend_registry_init(void) {
412
- static bool initialized = false;
413
-
414
- if (initialized) {
415
- return;
416
- }
417
-
418
- initialized = true;
419
-
420
- ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
421
-
422
- // add forward decls here to avoid including the backend headers
423
- #ifdef GGML_USE_CUDA
424
- extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
425
- ggml_backend_cuda_reg_devices();
426
- #endif
427
-
428
- #ifdef GGML_USE_SYCL
429
- extern void ggml_backend_sycl_reg_devices(void);
430
- ggml_backend_sycl_reg_devices();
431
- #endif
432
-
433
- #ifdef GGML_USE_METAL
434
- extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
435
- extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
436
- ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
437
- #endif
438
-
439
- #ifdef GGML_USE_VULKAN
440
- extern GGML_CALL int ggml_backend_vk_reg_devices(void);
441
- ggml_backend_vk_reg_devices();
442
- #endif
443
-
444
- #ifdef GGML_USE_KOMPUTE
445
- extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
446
- ggml_backend_kompute_reg_devices();
447
- #endif
448
- }
449
-
450
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
451
- GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
452
-
453
- size_t id = ggml_backend_registry_count;
454
-
455
- ggml_backend_registry[id] = (struct ggml_backend_reg) {
456
- /* .name = */ {0},
457
- /* .fn = */ init_fn,
458
- /* .default_buffer_type = */ default_buffer_type,
459
- /* .user_data = */ user_data,
460
- };
461
-
462
- snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
463
-
464
- #ifndef NDEBUG
465
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
466
- #endif
467
-
468
- ggml_backend_registry_count++;
469
- }
470
-
471
- size_t ggml_backend_reg_get_count(void) {
472
- ggml_backend_registry_init();
473
-
474
- return ggml_backend_registry_count;
475
- }
476
-
477
- size_t ggml_backend_reg_find_by_name(const char * name) {
478
- ggml_backend_registry_init();
479
-
480
- for (size_t i = 0; i < ggml_backend_registry_count; i++) {
481
- // TODO: case insensitive in a portable way
482
- if (strcmp(ggml_backend_registry[i].name, name) == 0) {
483
- return i;
484
- }
485
- }
486
-
487
- // not found
488
- return SIZE_MAX;
489
- }
490
-
491
- // init from backend:params string
492
- ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
493
- ggml_backend_registry_init();
494
-
495
- const char * params = strchr(backend_str, ':');
496
- char backend_name[128];
497
- if (params == NULL) {
498
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
499
- params = "";
500
- } else {
501
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
502
- params++;
503
- }
504
-
505
- size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
506
-
507
- if (backend_i == SIZE_MAX) {
508
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
509
- return NULL;
510
- }
511
-
512
- return ggml_backend_reg_init_backend(backend_i, params);
513
- }
514
-
515
- const char * ggml_backend_reg_get_name(size_t i) {
516
- ggml_backend_registry_init();
517
-
518
- GGML_ASSERT(i < ggml_backend_registry_count);
519
- return ggml_backend_registry[i].name;
520
- }
521
-
522
- ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
523
- ggml_backend_registry_init();
524
-
525
- GGML_ASSERT(i < ggml_backend_registry_count);
526
- return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
527
- }
528
-
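// [editor's note] sketch, not part of the removed file: walking the backend
// registry defined above and creating a backend from a "name:params" string.
// "CPU" is always registered by ggml_backend_registry_init; illustrative only.
#include <stdio.h>
#include "ggml-backend.h"

ggml_backend_t init_backend_by_name_or_cpu(const char * name) {
    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
        fprintf(stderr, "registered backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
    }
    ggml_backend_t backend = ggml_backend_reg_init_backend_from_str(name);
    if (backend == NULL) {
        // fall back to the CPU backend, which is registered unconditionally
        backend = ggml_backend_reg_init_backend_from_str("CPU");
    }
    return backend;
}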
529
- ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
530
- ggml_backend_registry_init();
531
-
532
- GGML_ASSERT(i < ggml_backend_registry_count);
533
- return ggml_backend_registry[i].default_buffer_type;
534
- }
535
-
536
- ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
537
- ggml_backend_registry_init();
538
-
539
- GGML_ASSERT(i < ggml_backend_registry_count);
540
- return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
541
- }
542
-
543
- // backend CPU
544
-
545
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
546
-
547
- GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
548
- return "CPU";
549
-
550
- GGML_UNUSED(buffer);
551
- }
552
-
553
- GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
554
- uintptr_t data = (uintptr_t)buffer->context;
555
-
556
- // align the buffer
557
- if (data % TENSOR_ALIGNMENT != 0) {
558
- data = GGML_PAD(data, TENSOR_ALIGNMENT);
559
- }
560
-
561
- return (void *)data;
562
- }
563
-
564
- GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
565
- free(buffer->context);
566
- }
567
-
568
- GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
569
- memcpy((char *)tensor->data + offset, data, size);
570
-
571
- GGML_UNUSED(buffer);
572
- }
573
-
574
- GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
575
- memcpy(data, (const char *)tensor->data + offset, size);
576
-
577
- GGML_UNUSED(buffer);
578
- }
579
-
580
- GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
581
- if (ggml_backend_buffer_is_host(src->buffer)) {
582
- memcpy(dst->data, src->data, ggml_nbytes(src));
583
- return true;
584
- }
585
- return false;
586
-
587
- GGML_UNUSED(buffer);
588
- }
589
-
590
- GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
591
- memset(buffer->context, value, buffer->size);
592
- }
593
-
594
- static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
595
- /* .get_name = */ ggml_backend_cpu_buffer_name,
596
- /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
597
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
598
- /* .init_tensor = */ NULL, // no initialization required
599
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
600
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
601
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
602
- /* .clear = */ ggml_backend_cpu_buffer_clear,
603
- /* .reset = */ NULL,
604
- };
605
-
606
- // for buffers from ptr, free is not called
607
- static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
608
- /* .get_name = */ ggml_backend_cpu_buffer_name,
609
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
610
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
611
- /* .init_tensor = */ NULL, // no initialization required
612
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
613
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
614
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
615
- /* .clear = */ ggml_backend_cpu_buffer_clear,
616
- /* .reset = */ NULL,
617
- };
618
-
619
- GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
620
- return "CPU";
621
-
622
- GGML_UNUSED(buft);
623
- }
624
-
625
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
626
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
627
- void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
628
- if (data == NULL) {
629
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
630
- return NULL;
631
- }
632
-
633
- return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
634
- }
635
-
636
- GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
637
- return TENSOR_ALIGNMENT;
638
-
639
- GGML_UNUSED(buft);
640
- }
641
-
642
- GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
643
- return true;
644
-
645
- GGML_UNUSED(buft);
646
- }
647
-
648
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
649
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
650
- /* .iface = */ {
651
- /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
652
- /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
653
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
654
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
655
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
656
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
657
- },
658
- /* .context = */ NULL,
659
- };
660
-
661
- return &ggml_backend_cpu_buffer_type;
662
- }
663
-
664
- #ifdef GGML_USE_CPU_HBM
665
-
666
- // buffer type HBM
667
-
668
- #include <hbwmalloc.h>
669
-
670
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
671
- return "CPU_HBM";
672
-
673
- GGML_UNUSED(buft);
674
- }
675
-
676
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
677
- return "CPU_HBM";
678
-
679
- GGML_UNUSED(buf);
680
- }
681
-
682
- GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
683
- hbw_free(buffer->context);
684
- }
685
-
686
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
687
- //void * ptr = hbw_malloc(size);
688
- void * ptr;
689
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
690
- if (result != 0) {
691
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
692
- return NULL;
693
- }
694
-
695
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
696
- buffer->buft = buft;
697
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
698
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
699
-
700
- return buffer;
701
- }
702
-
703
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
704
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
705
- /* .iface = */ {
706
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
707
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
708
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
709
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
710
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
711
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
712
- },
713
- /* .context = */ NULL,
714
- };
715
-
716
- return &ggml_backend_cpu_buffer_type_hbm;
717
- }
718
- #endif
719
-
720
- struct ggml_backend_cpu_context {
721
- int n_threads;
722
- void * work_data;
723
- size_t work_size;
724
-
725
- ggml_abort_callback abort_callback;
726
- void * abort_callback_data;
727
- };
728
-
729
- GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
730
- return "CPU";
731
-
732
- GGML_UNUSED(backend);
733
- }
734
-
735
- GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
736
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
737
- free(cpu_ctx->work_data);
738
- free(cpu_ctx);
739
- free(backend);
740
- }
741
-
742
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
743
- return ggml_backend_cpu_buffer_type();
744
-
745
- GGML_UNUSED(backend);
746
- }
747
-
748
- struct ggml_backend_plan_cpu {
749
- struct ggml_cplan cplan;
750
- struct ggml_cgraph cgraph;
751
- };
752
-
753
- GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
754
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
755
-
756
- struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
757
-
758
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
759
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
760
-
761
- if (cpu_plan->cplan.work_size > 0) {
762
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
763
- if (cpu_plan->cplan.work_data == NULL) {
764
- free(cpu_plan);
765
- return NULL;
766
- }
767
- }
768
-
769
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
770
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
771
-
772
- return cpu_plan;
773
- }
774
-
775
- GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
776
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
777
-
778
- free(cpu_plan->cplan.work_data);
779
- free(cpu_plan);
780
-
781
- GGML_UNUSED(backend);
782
- }
783
-
784
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
785
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
786
-
787
- return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
788
-
789
- GGML_UNUSED(backend);
790
- }
791
-
792
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
793
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
794
-
795
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
796
-
797
- if (cpu_ctx->work_size < cplan.work_size) {
798
- free(cpu_ctx->work_data);
799
- cpu_ctx->work_data = malloc(cplan.work_size);
800
- if (cpu_ctx->work_data == NULL) {
801
- cpu_ctx->work_size = 0;
802
- return GGML_STATUS_ALLOC_FAILED;
803
- }
804
- cpu_ctx->work_size = cplan.work_size;
805
- }
806
- cplan.work_data = cpu_ctx->work_data;
807
-
808
- cplan.abort_callback = cpu_ctx->abort_callback;
809
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
810
-
811
- return ggml_graph_compute(cgraph, &cplan);
812
- }
813
-
814
- GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
815
- switch (op->op) {
816
- case GGML_OP_CPY:
817
- return
818
- op->type != GGML_TYPE_IQ2_XXS &&
819
- op->type != GGML_TYPE_IQ2_XS &&
820
- op->type != GGML_TYPE_IQ1_S &&
821
- op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
822
- case GGML_OP_MUL_MAT:
823
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
824
- default:
825
- return true;
826
- }
827
-
828
- GGML_UNUSED(backend);
829
- }
830
-
831
- GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
832
- return ggml_backend_buft_is_host(buft);
833
-
834
- GGML_UNUSED(backend);
835
- }
836
-
837
- static struct ggml_backend_i cpu_backend_i = {
838
- /* .get_name = */ ggml_backend_cpu_name,
839
- /* .free = */ ggml_backend_cpu_free,
840
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
841
- /* .set_tensor_async = */ NULL,
842
- /* .get_tensor_async = */ NULL,
843
- /* .cpy_tensor_async = */ NULL,
844
- /* .synchronize = */ NULL,
845
- /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
846
- /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
847
- /* .graph_plan_update = */ NULL,
848
- /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
849
- /* .graph_compute = */ ggml_backend_cpu_graph_compute,
850
- /* .supports_op = */ ggml_backend_cpu_supports_op,
851
- /* .supports_buft = */ ggml_backend_cpu_supports_buft,
852
- /* .offload_op = */ NULL,
853
- /* .event_new = */ NULL,
854
- /* .event_free = */ NULL,
855
- /* .event_record = */ NULL,
856
- /* .event_wait = */ NULL,
857
- /* .event_synchronize = */ NULL,
858
- };
859
-
860
- static ggml_guid_t ggml_backend_cpu_guid(void) {
861
- static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
862
- return &guid;
863
- }
864
-
865
- ggml_backend_t ggml_backend_cpu_init(void) {
866
- struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
867
- if (ctx == NULL) {
868
- return NULL;
869
- }
870
-
871
- ctx->n_threads = GGML_DEFAULT_N_THREADS;
872
- ctx->work_data = NULL;
873
- ctx->work_size = 0;
874
- ctx->abort_callback = NULL;
875
- ctx->abort_callback_data = NULL;
876
-
877
- ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
878
- if (cpu_backend == NULL) {
879
- free(ctx);
880
- return NULL;
881
- }
882
-
883
- *cpu_backend = (struct ggml_backend) {
884
- /* .guid = */ ggml_backend_cpu_guid(),
885
- /* .interface = */ cpu_backend_i,
886
- /* .context = */ ctx
887
- };
888
- return cpu_backend;
889
- }
890
-
891
- GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
892
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
893
- }
894
-
895
- void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
896
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
897
-
898
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
899
- ctx->n_threads = n_threads;
900
- }
901
-
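// [editor's note] lifecycle sketch, not part of the removed file: create the CPU
// backend implemented above, tune its thread count, allocate a buffer from its
// default (host) buffer type, then release everything. Buffer size and thread
// count are arbitrary; error handling beyond NULL checks is omitted.
#include <stdio.h>
#include "ggml-backend.h"

int cpu_backend_smoke_test(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (backend == NULL) {
        return 1;
    }
    ggml_backend_cpu_set_n_threads(backend, 4);

    // 16 MiB scratch buffer from the backend's default (host) buffer type
    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, 16u*1024*1024);
    if (buf == NULL) {
        ggml_backend_free(backend);
        return 1;
    }
    printf("%s: allocated %zu bytes, is_host=%d\n",
           ggml_backend_name(backend),
           ggml_backend_buffer_get_size(buf),
           (int) ggml_backend_buffer_is_host(buf));
    ggml_backend_buffer_clear(buf, 0);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    return 0;
}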
902
- void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
903
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
904
-
905
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
906
- ctx->abort_callback = abort_callback;
907
- ctx->abort_callback_data = abort_callback_data;
908
- }
909
-
910
- GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
911
- GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
912
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
913
- }
914
-
915
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
916
- return ggml_backend_cpu_init();
917
-
918
- GGML_UNUSED(params);
919
- GGML_UNUSED(user_data);
920
- }
921
-
922
- // multi-buffer buffer
923
-
924
- struct ggml_backend_multi_buffer_context {
925
- ggml_backend_buffer_t * buffers;
926
- size_t n_buffers;
927
- };
928
-
929
- typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
930
-
931
- GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
932
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
933
-
934
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
935
- }
936
-
937
- GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
938
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
939
- for (size_t i = 0; i < ctx->n_buffers; i++) {
940
- ggml_backend_buffer_free(ctx->buffers[i]);
941
- }
942
-
943
- free(ctx->buffers);
944
- free(ctx);
945
- }
946
-
947
- GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
948
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
949
- for (size_t i = 0; i < ctx->n_buffers; i++) {
950
- ggml_backend_buffer_clear(ctx->buffers[i], value);
951
- }
952
- }
953
-
954
- static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
955
- static struct ggml_backend_buffer_i multi_backend_buffer_i = {
956
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
957
- /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
958
- /* .get_base = */ NULL,
959
- /* .init_tensor = */ NULL,
960
- /* .set_tensor = */ NULL,
961
- /* .get_tensor = */ NULL,
962
- /* .cpy_tensor = */ NULL,
963
- /* .clear = */ ggml_backend_multi_buffer_clear,
964
- /* .reset = */ NULL,
965
- };
966
-
967
- return multi_backend_buffer_i;
968
- }
969
-
970
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
971
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
972
- ctx->n_buffers = n_buffers;
973
- ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
974
-
975
- GGML_ASSERT(ctx->buffers != NULL);
976
-
977
- size_t total_size = 0;
978
- for (size_t i = 0; i < n_buffers; i++) {
979
- ctx->buffers[i] = buffers[i];
980
- total_size += ggml_backend_buffer_get_size(buffers[i]);
981
- }
982
-
983
- return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
984
- }
985
-
986
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
987
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
988
- }
989
-
990
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
991
- GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
992
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
993
- for (size_t i = 0; i < ctx->n_buffers; i++) {
994
- ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
995
- }
996
- }
997
-
998
- // creates a copy of the tensor with the same memory layout
999
- static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
1000
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
1001
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
1002
- dup->nb[i] = tensor->nb[i];
1003
- }
1004
- return dup;
1005
- }
1006
-
1007
- static bool ggml_is_view_op(enum ggml_op op) {
1008
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
1009
- }
1010
-
1011
- // scheduler
1012
-
1013
- #ifndef GGML_SCHED_MAX_BACKENDS
1014
- #define GGML_SCHED_MAX_BACKENDS 16
1015
- #endif
1016
-
1017
- #ifndef GGML_SCHED_MAX_SPLITS
1018
- #define GGML_SCHED_MAX_SPLITS 2048
1019
- #endif
1020
-
1021
- #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1022
- #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
1023
- #endif
1024
-
1025
- #ifndef GGML_SCHED_MAX_COPIES
1026
- #define GGML_SCHED_MAX_COPIES 4
1027
- #endif
1028
-
1029
- struct ggml_backend_sched_split {
1030
- int backend_id;
1031
- int i_start;
1032
- int i_end;
1033
- struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1034
- int n_inputs;
1035
- // graph view of this split
1036
- struct ggml_cgraph graph;
1037
- };
1038
-
1039
- struct ggml_backend_sched {
1040
- bool is_reset; // true if the scheduler has been reset since the last graph split
1041
- bool is_alloc;
1042
-
1043
- int n_backends;
1044
-
1045
- ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
1046
- ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
1047
- ggml_gallocr_t galloc;
1048
-
1049
- // hash keys of the nodes in the graph
1050
- struct ggml_hash_set hash_set;
1051
- // hash values
1052
- int * tensor_backend_id;
1053
- struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1054
-
1055
- int * node_backend_ids; // [graph_size]
1056
- int * leaf_backend_ids; // [graph_size]
1057
-
1058
- int * prev_node_backend_ids; // [graph_size]
1059
- int * prev_leaf_backend_ids; // [graph_size]
1060
-
1061
- // copy of the graph with modified inputs
1062
- struct ggml_cgraph * graph;
1063
-
1064
- // graph splits
1065
- struct ggml_backend_sched_split * splits;
1066
- int n_splits;
1067
- int splits_capacity;
1068
-
1069
- // pipeline parallelism support
1070
- int n_copies;
1071
- int cur_copy;
1072
- ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1073
- struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1074
- int n_graph_inputs;
1075
-
1076
- struct ggml_context * ctx;
1077
-
1078
- ggml_backend_sched_eval_callback callback_eval;
1079
- void * callback_eval_user_data;
1080
-
1081
- bool debug;
1082
-
1083
- // align context_buffer to GGML_MEM_ALIGN
1084
- #ifdef _MSC_VER
1085
- __declspec(align(GGML_MEM_ALIGN))
1086
- #else
1087
- __attribute__((aligned(GGML_MEM_ALIGN)))
1088
- #endif
1089
- char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
1090
- };
1091
-
1092
- #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
1093
- #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
1094
-
1095
- // returns the priority of the backend, lower id is higher priority
1096
- static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
1097
- for (int i = 0; i < sched->n_backends; i++) {
1098
- if (sched->backends[i] == backend) {
1099
- return i;
1100
- }
1101
- }
1102
- return -1;
1103
- }
1104
-
1105
- static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
1106
- ggml_backend_buffer_t buffer = tensor->buffer;
1107
- if (buffer == NULL) {
1108
- return -1;
1109
- }
1110
-
1111
- // find highest prio backend that supports the buffer type and the op
1112
- for (int i = 0; i < sched->n_backends; i++) {
1113
- if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1114
- ggml_backend_supports_op(sched->backends[i], op)) {
1115
- return i;
1116
- }
1117
- }
1118
-
1119
- #ifndef NDEBUG
1120
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1121
- __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
1122
- #endif
1123
-
1124
- return -1;
1125
- }
1126
-
1127
- #if 0
1128
- static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1129
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1130
- #define GET_CAUSE(node) causes[hash_id(node)]
1131
- #else
1132
- #define SET_CAUSE(node, ...)
1133
- #define GET_CAUSE(node) ""
1134
- #endif
1135
-
1136
- // returns the backend that should be used for the node based on the current locations
1137
- static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1138
- // TODO: use supports_op to check if the backend supports the op
1139
-
1140
- // assign pre-allocated nodes to their backend
1141
- int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1142
- if (cur_backend_id != -1) {
1143
- SET_CAUSE(tensor, "1.dst");
1144
- return cur_backend_id;
1145
- }
1146
-
1147
- // view_src
1148
- if (tensor->view_src != NULL) {
1149
- cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1150
- if (cur_backend_id != -1) {
1151
- SET_CAUSE(tensor, "1.vsrc");
1152
- return cur_backend_id;
1153
- }
1154
- }
1155
-
1156
- // graph input
1157
- if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1158
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1159
- SET_CAUSE(tensor, "1.inp");
1160
- return cur_backend_id;
1161
- }
1162
-
1163
- // assign nodes that use weights to the backend of the weights
1164
- // operations with weights are preferably run on the same backend as the weights
1165
- for (int i = 0; i < GGML_MAX_SRC; i++) {
1166
- const struct ggml_tensor * src = tensor->src[i];
1167
- if (src == NULL) {
1168
- continue;
1169
- }
1170
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1171
- int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1172
- // check if a backend with higher prio wants to offload the op
1173
- if (src_backend_id == sched->n_backends - 1) {
1174
- for (int b = 0; b < src_backend_id; b++) {
1175
- if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
1176
- SET_CAUSE(tensor, "1.off");
1177
- return b;
1178
- }
1179
- }
1180
- }
1181
- SET_CAUSE(tensor, "1.wgt%d", i);
1182
- return src_backend_id;
1183
- }
1184
- }
1185
-
1186
- return -1;
1187
- }
1188
-
1189
- static char * fmt_size(size_t size) {
1190
- static char buffer[128];
1191
- if (size >= 1024*1024) {
1192
- snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1193
- } else {
1194
- snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1195
- }
1196
- return buffer;
1197
- }
1198
-
1199
- static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1200
- int cur_split = 0;
1201
- for (int i = 0; i < graph->n_nodes; i++) {
1202
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1203
- ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1204
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1205
- sched->splits[cur_split].n_inputs);
1206
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1207
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1208
- fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
1209
- }
1210
- fprintf(stderr, "\n");
1211
- cur_split++;
1212
- }
1213
- struct ggml_tensor * node = graph->nodes[i];
1214
- if (ggml_is_view_op(node->op)) {
1215
- continue;
1216
- }
1217
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1218
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1219
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1220
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1221
- struct ggml_tensor * src = node->src[j];
1222
- if (src == NULL) {
1223
- continue;
1224
- }
1225
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1226
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1227
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1228
- }
1229
- fprintf(stderr, "\n");
1230
- }
1231
- }
1232
-
1233
- static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
1234
- ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1235
- ggml_backend_buffer_type_t buft = NULL;
1236
-
1237
- if (buf) {
1238
- // the tensor is already allocated
1239
- buft = buf->buft;
1240
- } else {
1241
- // see if the tensor already has a backend assigned, and use the buffer type of that backend
1242
- int tensor_backend_id = tensor_backend_id(t);
1243
- if (tensor_backend_id == -1 && t->view_src) {
1244
- tensor_backend_id = tensor_backend_id(t->view_src);
1245
- }
1246
- if (tensor_backend_id != -1) {
1247
- buft = sched->bufts[tensor_backend_id];
1248
- }
1249
- }
1250
-
1251
- return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
1252
- }
1253
-
1254
- static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1255
- if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1256
- *node_backend_id = cur_backend_id;
1257
- SET_CAUSE(node, "2.sup");
1258
- }
1259
- }
1260
-
1261
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1262
- static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1263
- // reset splits
1264
- sched->n_splits = 0;
1265
- sched->n_graph_inputs = 0;
1266
- sched->is_reset = false;
1267
-
1268
- struct ggml_init_params params = {
1269
- /* .mem_size = */ sizeof(sched->context_buffer),
1270
- /* .mem_buffer = */ sched->context_buffer,
1271
- /* .no_alloc = */ true
1272
- };
1273
-
1274
- ggml_free(sched->ctx);
1275
-
1276
- sched->ctx = ggml_init(params);
1277
- if (sched->ctx == NULL) {
1278
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
1279
- GGML_ASSERT(false);
1280
- }
1281
-
1282
- // pass 1: assign backends to ops with pre-allocated inputs
1283
- for (int i = 0; i < graph->n_leafs; i++) {
1284
- struct ggml_tensor * leaf = graph->leafs[i];
1285
- int * leaf_backend_id = &tensor_backend_id(leaf);
1286
- if (*leaf_backend_id != -1) {
1287
- // do not overwrite user assignments
1288
- continue;
1289
- }
1290
- *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1291
- }
1292
-
1293
- for (int i = 0; i < graph->n_nodes; i++) {
1294
- struct ggml_tensor * node = graph->nodes[i];
1295
- int * node_backend_id = &tensor_backend_id(node);
1296
- if (*node_backend_id != -1) {
1297
- // do not overwrite user assignments
1298
- continue;
1299
- }
1300
- *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1301
- // src
1302
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1303
- struct ggml_tensor * src = node->src[j];
1304
- if (src == NULL) {
1305
- continue;
1306
- }
1307
- int * src_backend_id = &tensor_backend_id(src);
1308
- if (*src_backend_id == -1) {
1309
- *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
1310
- }
1311
- }
1312
- }
1313
-
1314
- // pass 2: expand current backend assignments
1315
- // assign the same backend to adjacent nodes
1316
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1317
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1318
- // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1319
- // expand gpu down
1320
- {
1321
- int cur_backend_id = -1;
1322
- for (int i = 0; i < graph->n_nodes; i++) {
1323
- struct ggml_tensor * node = graph->nodes[i];
1324
- if (ggml_is_view_op(node->op)) {
1325
- continue;
1326
- }
1327
- int * node_backend_id = &tensor_backend_id(node);
1328
- if (*node_backend_id != -1) {
1329
- if (*node_backend_id == sched->n_backends - 1) {
1330
- // skip cpu (lowest prio backend)
1331
- cur_backend_id = -1;
1332
- } else {
1333
- cur_backend_id = *node_backend_id;
1334
- }
1335
- } else if (cur_backend_id != -1) {
1336
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1337
- }
1338
- }
1339
- }
1340
- // expand gpu up
1341
- {
1342
- int cur_backend_id = -1;
1343
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1344
- struct ggml_tensor * node = graph->nodes[i];
1345
- if (ggml_is_view_op(node->op)) {
1346
- continue;
1347
- }
1348
- int * node_backend_id = &tensor_backend_id(node);
1349
- if (*node_backend_id != -1) {
1350
- if (*node_backend_id == sched->n_backends - 1) {
1351
- // skip cpu (lowest prio backend)
1352
- cur_backend_id = -1;
1353
- } else {
1354
- cur_backend_id = *node_backend_id;
1355
- }
1356
- } else if (cur_backend_id != -1) {
1357
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1358
- }
1359
- }
1360
- }
1361
- // expand rest down
1362
- {
1363
- int cur_backend_id = -1;
1364
- for (int i = 0; i < graph->n_nodes; i++) {
1365
- struct ggml_tensor * node = graph->nodes[i];
1366
- if (ggml_is_view_op(node->op)) {
1367
- continue;
1368
- }
1369
- int * node_backend_id = &tensor_backend_id(node);
1370
- if (*node_backend_id != -1) {
1371
- cur_backend_id = *node_backend_id;
1372
- } else if (cur_backend_id != -1) {
1373
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1374
- }
1375
- }
1376
- }
1377
- // expand rest up
1378
- {
1379
- int cur_backend_id = -1;
1380
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1381
- struct ggml_tensor * node = graph->nodes[i];
1382
- if (ggml_is_view_op(node->op)) {
1383
- continue;
1384
- }
1385
- int * node_backend_id = &tensor_backend_id(node);
1386
- if (*node_backend_id != -1) {
1387
- cur_backend_id = *node_backend_id;
1388
- } else if (cur_backend_id != -1) {
1389
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1390
- }
1391
- }
1392
- }
1393
-
1394
- // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1395
- // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1396
- // however, we also need to verify that the sources are in compatible buffer types
1397
- // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1398
- // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1399
- // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1400
- // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1401
- // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1402
- for (int i = 0; i < graph->n_nodes; i++) {
1403
- struct ggml_tensor * node = graph->nodes[i];
1404
- if (ggml_is_view_op(node->op)) {
1405
- continue;
1406
- }
1407
- int * node_backend_id = &tensor_backend_id(node);
1408
- if (*node_backend_id == -1) {
1409
- // unassigned node: find the backend with the most supported inputs
1410
- int n_supported_best = -1;
1411
- for (int b = 0; b < sched->n_backends; b++) {
1412
- if (ggml_backend_supports_op(sched->backends[b], node)) {
1413
- int n_supported = 0;
1414
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1415
- struct ggml_tensor * src = node->src[j];
1416
- if (src == NULL) {
1417
- continue;
1418
- }
1419
- if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
1420
- n_supported++;
1421
- }
1422
- }
1423
- if (n_supported > n_supported_best) {
1424
- n_supported_best = n_supported;
1425
- *node_backend_id = b;
1426
- SET_CAUSE(node, "3.best");
1427
- }
1428
- }
1429
- }
1430
- } else {
1431
- // assigned node: upgrade to higher prio backend if possible
1432
- for (int b = 0; b < *node_backend_id; b++) {
1433
- if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
1434
- bool supported = true;
1435
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1436
- struct ggml_tensor * src = node->src[j];
1437
- if (src == NULL) {
1438
- continue;
1439
- }
1440
- if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
1441
- supported = false;
1442
- break;
1443
- }
1444
- }
1445
- if (supported) {
1446
- *node_backend_id = b;
1447
- SET_CAUSE(node, "3.upg");
1448
- break;
1449
- }
1450
- }
1451
- }
1452
- }
1453
- }
1454
-
1455
- // pass 4: assign backends to remaining src from dst and view_src
1456
- for (int i = 0; i < graph->n_nodes; i++) {
1457
- struct ggml_tensor * node = graph->nodes[i];
1458
- int * cur_backend_id = &tensor_backend_id(node);
1459
- if (node->view_src != NULL && *cur_backend_id == -1) {
1460
- *cur_backend_id = tensor_backend_id(node->view_src);
1461
- SET_CAUSE(node, "4.vsrc");
1462
- }
1463
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1464
- struct ggml_tensor * src = node->src[j];
1465
- if (src == NULL) {
1466
- continue;
1467
- }
1468
- int * src_backend_id = &tensor_backend_id(src);
1469
- if (*src_backend_id == -1) {
1470
- if (src->view_src != NULL) {
1471
- // views are always on the same backend as the source
1472
- *src_backend_id = tensor_backend_id(src->view_src);
1473
- SET_CAUSE(src, "4.vsrc");
1474
- } else {
1475
- *src_backend_id = *cur_backend_id;
1476
- SET_CAUSE(src, "4.cur");
1477
- }
1478
- }
1479
- }
1480
- }
1481
-
1482
- // pass 4: split graph, find tensors that need to be copied
1483
- {
1484
- int i_split = 0;
1485
- struct ggml_backend_sched_split * split = &sched->splits[0];
1486
- // find the backend of the first split, skipping view ops
1487
- for (int i = 0; i < graph->n_nodes; i++) {
1488
- struct ggml_tensor * node = graph->nodes[i];
1489
- if (!ggml_is_view_op(node->op)) {
1490
- split->backend_id = tensor_backend_id(node);
1491
- break;
1492
- }
1493
- }
1494
- split->i_start = 0;
1495
- split->n_inputs = 0;
1496
- memset(split->inputs, 0, sizeof(split->inputs)); //HACK
1497
- int cur_backend_id = split->backend_id;
1498
- for (int i = 0; i < graph->n_nodes; i++) {
1499
- struct ggml_tensor * node = graph->nodes[i];
1500
-
1501
- if (ggml_is_view_op(node->op)) {
1502
- continue;
1503
- }
1504
-
1505
- const int node_backend_id = tensor_backend_id(node);
1506
-
1507
- GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
1508
-
1509
- // check if we should start a new split based on the sources of the current node
1510
- bool need_new_split = false;
1511
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1512
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1513
- struct ggml_tensor * src = node->src[j];
1514
- if (src == NULL) {
1515
- continue;
1516
- }
1517
- // check if a weight is on a different backend
1518
- // by starting a new split, the memory of the previously offloaded weights can be reused
1519
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1520
- int src_backend_id = tensor_backend_id(src);
1521
- if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
1522
- need_new_split = true;
1523
- break;
1524
- }
1525
- }
1526
- // check if the split has too many inputs
1527
- // FIXME: count the number of inputs instead of only checking when full
1528
- if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1529
- const size_t id = hash_id(src);
1530
- int src_backend_id = sched->tensor_backend_id[id];
1531
- bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1532
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
1533
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1534
- need_new_split = true;
1535
- break;
1536
- }
1537
- }
1538
- }
1539
- }
1540
-
1541
- if (node_backend_id != cur_backend_id || need_new_split) {
1542
- split->i_end = i;
1543
- i_split++;
1544
- if (i_split >= sched->splits_capacity) {
1545
- sched->splits_capacity *= 2;
1546
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1547
- GGML_ASSERT(sched->splits != NULL);
1548
- }
1549
- GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
1550
- split = &sched->splits[i_split];
1551
- split->backend_id = node_backend_id;
1552
- split->i_start = i;
1553
- split->n_inputs = 0;
1554
- cur_backend_id = node_backend_id;
1555
- }
1556
-
1557
- // find inputs that are not on the same backend
1558
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1559
- struct ggml_tensor * src = node->src[j];
1560
- if (src == NULL) {
1561
- continue;
1562
- }
1563
-
1564
- const int src_backend_id = tensor_backend_id(src);
1565
- assert(src_backend_id != -1); // all inputs should be assigned by now
1566
-
1567
- if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1568
- size_t id = hash_id(src);
1569
- if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1570
- ggml_backend_t backend = sched->backends[src_backend_id];
1571
- for (int c = 0; c < sched->n_copies; c++) {
1572
- struct ggml_tensor * tensor_copy;
1573
- if (c == sched->cur_copy) {
1574
- tensor_copy = src; // use the original tensor as the current copy
1575
- } else {
1576
- tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1577
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1578
- }
1579
- if (sched->n_copies > 1) {
1580
- ggml_set_input(tensor_copy);
1581
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1582
- }
1583
- sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
1584
- SET_CAUSE(tensor_copy, "4.cpy");
1585
- }
1586
- int n_graph_inputs = sched->n_graph_inputs++;
1587
- GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1588
- sched->graph_inputs[n_graph_inputs] = src;
1589
- }
1590
- }
1591
-
1592
- bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1593
- if (src_backend_id != cur_backend_id && !supported) {
1594
- // create a copy of the input in the split's backend
1595
- const size_t id = hash_id(src);
1596
- if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
1597
- ggml_backend_t backend = sched->backends[cur_backend_id];
1598
- for (int c = 0; c < sched->n_copies; c++) {
1599
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1600
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1601
- if (sched->n_copies > 1) {
1602
- ggml_set_input(tensor_copy);
1603
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1604
- }
1605
- sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
1606
- SET_CAUSE(tensor_copy, "4.cpy");
1607
- }
1608
- int n_inputs = split->n_inputs++;
1609
- GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1610
- split->inputs[n_inputs] = src;
1611
- }
1612
- node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
1613
- }
1614
- }
1615
- }
1616
- split->i_end = graph->n_nodes;
1617
- sched->n_splits = i_split + 1;
1618
- }
1619
-
1620
- if (sched->debug) {
1621
- ggml_backend_sched_print_assignments(sched, graph);
1622
- }
1623
-
1624
- // swap node_backend_ids and leaf_backend_ids and prevs
1625
- {
1626
- int * tmp = sched->node_backend_ids;
1627
- sched->node_backend_ids = sched->prev_node_backend_ids;
1628
- sched->prev_node_backend_ids = tmp;
1629
-
1630
- tmp = sched->leaf_backend_ids;
1631
- sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1632
- sched->prev_leaf_backend_ids = tmp;
1633
- }
1634
-
1635
- // create copies of the graph for each split
1636
- // TODO: avoid this copy
1637
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
1638
- for (int i = 0; i < sched->n_splits; i++) {
1639
- struct ggml_backend_sched_split * split = &sched->splits[i];
1640
- split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1641
-
1642
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1643
- for (int j = 0; j < split->n_inputs; j++) {
1644
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1645
-
1646
- struct ggml_tensor * input = split->inputs[j];
1647
- const size_t input_id = hash_id(input);
1648
- struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
1649
-
1650
- // add a dependency to the input source so that it is not freed before the copy is done
1651
- struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1652
- input_dep->src[0] = input;
1653
- sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
1654
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1655
-
1656
- // add a dependency to the input copy so that it is allocated at the start of the split
1657
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1658
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1659
- }
1660
-
1661
- for (int j = split->i_start; j < split->i_end; j++) {
1662
- assert(graph_copy->size > graph_copy->n_nodes);
1663
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1664
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1665
- }
1666
- }
1667
-
1668
- if (sched->n_copies > 1) {
1669
- // add input copies as leafs so that they are allocated first
1670
- for (int i = 0; i < sched->n_graph_inputs; i++) {
1671
- struct ggml_tensor * input = sched->graph_inputs[i];
1672
- size_t id = hash_id(input);
1673
- int backend_id = tensor_backend_id(input);
1674
- for (int c = 0; c < sched->n_copies; c++) {
1675
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1676
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1677
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1678
- }
1679
- }
1680
-
1681
- for (int i = 0; i < sched->n_splits; i++) {
1682
- struct ggml_backend_sched_split * split = &sched->splits[i];
1683
- int backend_id = split->backend_id;
1684
- for (int j = 0; j < split->n_inputs; j++) {
1685
- struct ggml_tensor * input = split->inputs[j];
1686
- size_t id = hash_id(input);
1687
- for (int c = 0; c < sched->n_copies; c++) {
1688
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1689
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1690
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1691
- }
1692
- }
1693
- }
1694
- }
1695
-
1696
- // add leafs from the original graph
1697
- for (int i = 0; i < graph->n_leafs; i++) {
1698
- struct ggml_tensor * leaf = graph->leafs[i];
1699
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1700
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1701
- }
1702
-
1703
- sched->graph = graph_copy;
1704
- }
1705
-
1706
- static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1707
- bool backend_ids_changed = false;
1708
- for (int i = 0; i < sched->graph->n_nodes; i++) {
1709
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1710
- sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1711
- backend_ids_changed = true;
1712
- break;
1713
- }
1714
- }
1715
- if (!backend_ids_changed) {
1716
- for (int i = 0; i < sched->graph->n_leafs; i++) {
1717
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1718
- sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1719
- backend_ids_changed = true;
1720
- break;
1721
- }
1722
- }
1723
- }
1724
-
1725
- // allocate graph
1726
- if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1727
- // the re-allocation may cause the split inputs to be moved to a different address
1728
- ggml_backend_sched_synchronize(sched);
1729
- #ifndef NDEBUG
1730
- fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
1731
- #endif
1732
- ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1733
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1734
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1735
- return false;
1736
- }
1737
- }
1738
-
1739
- return true;
1740
- }
1741
-
1742
- static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1743
- struct ggml_backend_sched_split * splits = sched->splits;
1744
-
1745
- for (int i = 0; i < sched->n_splits; i++) {
1746
- struct ggml_backend_sched_split * split = &splits[i];
1747
- int split_backend_id = split->backend_id;
1748
- ggml_backend_t split_backend = sched->backends[split_backend_id];
1749
-
1750
- // copy the input tensors to the split backend
1751
- for (int j = 0; j < split->n_inputs; j++) {
1752
- ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1753
- struct ggml_tensor * input = split->inputs[j];
1754
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
1755
-
1756
- if (input->flags & GGML_TENSOR_FLAG_INPUT) {
1757
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1758
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1759
- ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1760
- } else {
1761
- ggml_backend_synchronize(split_backend);
1762
- }
1763
- ggml_backend_tensor_copy(input, input_cpy);
1764
- } else {
1765
- // wait for the split backend to finish using the input before overwriting it
1766
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1767
- ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1768
- } else {
1769
- ggml_backend_synchronize(split_backend);
1770
- }
1771
- ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
1772
- }
1773
- }
1774
-
1775
- if (!sched->callback_eval) {
1776
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
1777
- if (ec != GGML_STATUS_SUCCESS) {
1778
- return ec;
1779
- }
1780
- } else {
1781
- // similar to ggml_backend_compare_graph_backend
1782
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1783
- struct ggml_tensor * t = split->graph.nodes[j0];
1784
-
1785
- // check if the user needs data from this node
1786
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1787
-
1788
- int j1 = j0;
1789
-
1790
- // determine the range [j0, j1] of nodes that can be computed together
1791
- while (!need && j1 < split->graph.n_nodes - 1) {
1792
- t = split->graph.nodes[++j1];
1793
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1794
- }
1795
-
1796
- struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1797
-
1798
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
1799
- if (ec != GGML_STATUS_SUCCESS) {
1800
- return ec;
1801
- }
1802
-
1803
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1804
- ggml_backend_synchronize(split_backend);
1805
-
1806
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1807
- break;
1808
- }
1809
-
1810
- j0 = j1;
1811
- }
1812
- }
1813
-
1814
- // record the event of this copy
1815
- if (split->n_inputs > 0) {
1816
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1817
- ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
1818
- }
1819
- }
1820
- }
1821
-
1822
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1823
-
1824
- return GGML_STATUS_SUCCESS;
1825
- }
1826
-
1827
- ggml_backend_sched_t ggml_backend_sched_new(
1828
- ggml_backend_t * backends,
1829
- ggml_backend_buffer_type_t * bufts,
1830
- int n_backends,
1831
- size_t graph_size,
1832
- bool parallel) {
1833
- GGML_ASSERT(n_backends > 0);
1834
- GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
1835
- GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1836
-
1837
- struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1838
-
1839
- sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
1840
-
1841
- // initialize hash table
1842
- sched->hash_set = ggml_hash_set_new(graph_size);
1843
- sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
1844
- sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
1845
-
1846
- const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1847
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1848
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1849
- sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1850
- sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1851
-
1852
- sched->n_backends = n_backends;
1853
-
1854
- sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1855
-
1856
- const int initial_splits_capacity = 16;
1857
- sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1858
- sched->splits_capacity = initial_splits_capacity;
1859
-
1860
- for (int b = 0; b < n_backends; b++) {
1861
- sched->backends[b] = backends[b];
1862
- sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
1863
- GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1864
- if (sched->n_copies > 1) {
1865
- for (int c = 0; c < sched->n_copies; c++) {
1866
- sched->events[b][c] = ggml_backend_event_new(backends[b]);
1867
- }
1868
- }
1869
- }
1870
-
1871
- sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
1872
-
1873
- ggml_backend_sched_reset(sched);
1874
-
1875
- return sched;
1876
- }
1877
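For context, the constructor removed above expects the CPU backend to be passed last and, when bufts is NULL, falls back to each backend's default buffer type. Below is a minimal usage sketch, assuming a single CPU backend and the usual ggml.h / ggml-backend.h declarations; the tensor shapes and buffer sizes are illustrative only.

    #include "ggml.h"
    #include "ggml-backend.h"

    // minimal sketch: one CPU backend, a 4x4 matmul, scheduler-managed allocation
    static void sched_example(void) {
        ggml_backend_t cpu = ggml_backend_cpu_init();

        // the last backend passed to the scheduler must be the CPU backend;
        // passing NULL for bufts selects each backend's default buffer type
        ggml_backend_sched_t sched = ggml_backend_sched_new(&cpu, NULL, 1, GGML_DEFAULT_GRAPH_SIZE, false);

        // build the graph with no_alloc = true; the scheduler allocates the tensor data
        struct ggml_init_params params = {
            /* .mem_size   = */ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        struct ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, ggml_mul_mat(ctx, a, b));

        // assign backends, split the graph and allocate the tensors
        ggml_backend_sched_alloc_graph(sched, gf);

        // inputs can be set once the tensors are allocated
        float a_data[16] = {0};
        float b_data[16] = {0};
        ggml_backend_tensor_set(a, a_data, 0, ggml_nbytes(a));
        ggml_backend_tensor_set(b, b_data, 0, ggml_nbytes(b));

        ggml_backend_sched_graph_compute(sched, gf);

        ggml_backend_sched_reset(sched);
        ggml_backend_sched_free(sched);
        ggml_free(ctx);
        ggml_backend_free(cpu);
    }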
-
1878
- void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1879
- if (sched == NULL) {
1880
- return;
1881
- }
1882
- for (int b = 0; b < sched->n_backends; b++) {
1883
- for (int c = 0; c < sched->n_copies; c++) {
1884
- ggml_backend_event_free(sched->events[b][c]);
1885
- }
1886
- }
1887
- ggml_gallocr_free(sched->galloc);
1888
- ggml_free(sched->ctx);
1889
- free(sched->splits);
1890
- free(sched->hash_set.keys);
1891
- free(sched->tensor_backend_id);
1892
- free(sched->tensor_copies);
1893
- free(sched->node_backend_ids);
1894
- free(sched->leaf_backend_ids);
1895
- free(sched->prev_node_backend_ids);
1896
- free(sched->prev_leaf_backend_ids);
1897
- free(sched);
1898
- }
1899
-
1900
- void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1901
- // reset state for the next run
1902
- if (!sched->is_reset) {
1903
- size_t hash_size = sched->hash_set.size;
1904
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1905
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1906
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1907
-
1908
- sched->is_reset = true;
1909
- }
1910
- sched->is_alloc = false;
1911
- }
1912
-
1913
- bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1914
- GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
1915
-
1916
- ggml_backend_sched_split_graph(sched, measure_graph);
1917
-
1918
- // TODO: extract this to a separate function
1919
- if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1920
- return false;
1921
- }
1922
-
1923
- ggml_backend_sched_reset(sched);
1924
- ggml_backend_sched_synchronize(sched);
1925
-
1926
- return true;
1927
- }
1928
-
1929
- bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1930
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
1931
-
1932
- ggml_backend_sched_split_graph(sched, graph);
1933
-
1934
- if (!ggml_backend_sched_alloc_splits(sched)) {
1935
- return false;
1936
- }
1937
-
1938
- sched->is_alloc = true;
1939
-
1940
- return true;
1941
- }
1942
-
1943
- enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1944
- enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
1945
- ggml_backend_sched_synchronize(sched);
1946
- return err;
1947
- }
1948
-
1949
- enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1950
- if (!sched->is_reset && !sched->is_alloc) {
1951
- ggml_backend_sched_reset(sched);
1952
- }
1953
-
1954
- if (!sched->is_alloc) {
1955
- if (!ggml_backend_sched_alloc_graph(sched, graph)) {
1956
- return GGML_STATUS_ALLOC_FAILED;
1957
- }
1958
- }
1959
-
1960
- return ggml_backend_sched_compute_splits(sched);
1961
- }
1962
-
1963
- void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
1964
- for (int i = 0; i < sched->n_backends; i++) {
1965
- ggml_backend_synchronize(sched->backends[i]);
1966
- }
1967
- }
1968
-
1969
- void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1970
- sched->callback_eval = callback;
1971
- sched->callback_eval_user_data = user_data;
1972
- }
1973
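For reference, the eval callback set here is invoked twice per node range by ggml_backend_sched_compute_splits: first with ask = true to check whether the caller wants the data of that node, then with ask = false after the range has been computed; returning false from the second call aborts the remaining nodes of the split. A small sketch of such a callback, assuming the ggml_backend_sched_eval_callback signature from ggml-backend.h:

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // ask == true : return true so the scheduler stops after this node and the
    //               result can be inspected (nodes that return false are batched)
    // ask == false: called after the node was computed; return false to abort
    static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return t->op == GGML_OP_MUL_MAT;
        }
        fprintf(stderr, "computed %s (%s)\n", t->name, ggml_op_name(t->op));
        return true;
    }

    // registered with: ggml_backend_sched_set_eval_callback(sched, observe_mul_mat, NULL);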
-
1974
- int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1975
- return sched->n_splits;
1976
- }
1977
-
1978
- int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
1979
- return sched->n_copies;
1980
- }
1981
-
1982
- int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
1983
- return sched->n_backends;
1984
- }
1985
-
1986
- ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
1987
- GGML_ASSERT(i >= 0 && i < sched->n_backends);
1988
- return sched->backends[i];
1989
- }
1990
-
1991
- size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
1992
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
1993
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1994
-
1995
- return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
1996
- }
1997
-
1998
- void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1999
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
2000
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2001
- tensor_backend_id(node) = backend_index;
2002
- SET_CAUSE(node, "usr");
2003
- }
2004
-
2005
- ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
2006
- int backend_index = tensor_backend_id(node);
2007
- if (backend_index == -1) {
2008
- return NULL;
2009
- }
2010
- return sched->backends[backend_index];
2011
- }
2012
-
2013
- // utils
2014
-
2015
- void ggml_backend_view_init(struct ggml_tensor * tensor) {
2016
- GGML_ASSERT(tensor->buffer == NULL);
2017
- GGML_ASSERT(tensor->view_src != NULL);
2018
- GGML_ASSERT(tensor->view_src->buffer != NULL);
2019
- GGML_ASSERT(tensor->view_src->data != NULL);
2020
-
2021
- tensor->buffer = tensor->view_src->buffer;
2022
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2023
- ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2024
- }
2025
-
2026
- void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
2027
- GGML_ASSERT(tensor->buffer == NULL);
2028
- GGML_ASSERT(tensor->data == NULL);
2029
- GGML_ASSERT(tensor->view_src == NULL);
2030
- GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
2031
- GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2032
- (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
2033
-
2034
- tensor->buffer = buffer;
2035
- tensor->data = addr;
2036
- ggml_backend_buffer_init_tensor(buffer, tensor);
2037
- }
2038
-
2039
- static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
2040
- struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
2041
-
2042
- GGML_ASSERT(src != NULL);
2043
- GGML_ASSERT(src->data && "graph must be allocated");
2044
-
2045
- size_t id = ggml_hash_insert(hash_set, src);
2046
- if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
2047
- return node_copies[ggml_hash_find(hash_set, src)];
2048
- }
2049
-
2050
- struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2051
- if (src->view_src != NULL) {
2052
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2053
- dst->view_offs = src->view_offs;
2054
- }
2055
- dst->op = src->op;
2056
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2057
- ggml_set_name(dst, src->name);
2058
-
2059
- // copy src
2060
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2061
- struct ggml_tensor * s = src->src[i];
2062
- if (s == NULL) {
2063
- continue;
2064
- }
2065
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2066
- }
2067
-
2068
- node_copies[id] = dst;
2069
- return dst;
2070
- }
2071
-
2072
- static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
2073
- size_t id = ggml_hash_find(hash_set, src);
2074
- if (node_init[id]) {
2075
- return;
2076
- }
2077
- node_init[id] = true;
2078
-
2079
- struct ggml_tensor * dst = node_copies[id];
2080
- if (dst->view_src != NULL) {
2081
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2082
- ggml_backend_view_init(dst);
2083
- }
2084
- else {
2085
- ggml_backend_tensor_copy(src, dst);
2086
- }
2087
-
2088
- // init src
2089
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2090
- struct ggml_tensor * s = src->src[i];
2091
- if (s == NULL) {
2092
- continue;
2093
- }
2094
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2095
- }
2096
- }
2097
-
2098
- struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
2099
- struct ggml_hash_set hash_set = {
2100
- /* .size = */ graph->visited_hash_table.size,
2101
- /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
2102
- };
2103
- struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2104
- bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
2105
-
2106
- struct ggml_init_params params = {
2107
- /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
2108
- /* .mem_buffer = */ NULL,
2109
- /* .no_alloc = */ true
2110
- };
2111
-
2112
- struct ggml_context * ctx_allocated = ggml_init(params);
2113
- struct ggml_context * ctx_unallocated = ggml_init(params);
2114
-
2115
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2116
- fprintf(stderr, "failed to allocate context for graph copy\n");
2117
- free(hash_set.keys);
2118
- free(node_copies);
2119
- free(node_init);
2120
- ggml_free(ctx_allocated);
2121
- ggml_free(ctx_unallocated);
2122
- return (struct ggml_backend_graph_copy) {
2123
- /* .buffer = */ NULL,
2124
- /* .ctx_allocated = */ NULL,
2125
- /* .ctx_unallocated = */ NULL,
2126
- /* .graph = */ NULL,
2127
- };
2128
- }
2129
-
2130
- // dup nodes
2131
- for (int i = 0; i < graph->n_nodes; i++) {
2132
- struct ggml_tensor * node = graph->nodes[i];
2133
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2134
- }
2135
-
2136
- // allocate nodes
2137
- ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2138
- if (buffer == NULL) {
2139
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
2140
- free(hash_set.keys);
2141
- free(node_copies);
2142
- free(node_init);
2143
- ggml_free(ctx_allocated);
2144
- ggml_free(ctx_unallocated);
2145
- return (struct ggml_backend_graph_copy) {
2146
- /* .buffer = */ NULL,
2147
- /* .ctx_allocated = */ NULL,
2148
- /* .ctx_unallocated = */ NULL,
2149
- /* .graph = */ NULL,
2150
- };
2151
- }
2152
-
2153
- //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2154
-
2155
- // copy data and init views
2156
- for (int i = 0; i < graph->n_nodes; i++) {
2157
- struct ggml_tensor * node = graph->nodes[i];
2158
- graph_copy_init_tensor(hash_set, node_copies, node_init, node);
2159
- }
2160
-
2161
- // build graph copy
2162
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
2163
- for (int i = 0; i < graph->n_nodes; i++) {
2164
- struct ggml_tensor * node = graph->nodes[i];
2165
- struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
2166
- graph_copy->nodes[i] = node_copy;
2167
- }
2168
- graph_copy->n_nodes = graph->n_nodes;
2169
-
2170
- free(hash_set.keys);
2171
- free(node_copies);
2172
- free(node_init);
2173
-
2174
- return (struct ggml_backend_graph_copy) {
2175
- /* .buffer = */ buffer,
2176
- /* .ctx_allocated = */ ctx_allocated,
2177
- /* .ctx_unallocated = */ ctx_unallocated,
2178
- /* .graph = */ graph_copy,
2179
- };
2180
- }
2181
-
2182
- void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
2183
- ggml_backend_buffer_free(copy.buffer);
2184
- ggml_free(copy.ctx_allocated);
2185
- ggml_free(copy.ctx_unallocated);
2186
- }
2187
-
2188
- bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
2189
- struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
2190
- if (copy.buffer == NULL) {
2191
- return false;
2192
- }
2193
-
2194
- struct ggml_cgraph * g1 = graph;
2195
- struct ggml_cgraph * g2 = copy.graph;
2196
-
2197
- assert(g1->n_nodes == g2->n_nodes);
2198
-
2199
- for (int i = 0; i < g1->n_nodes; i++) {
2200
- //printf("eval %d/%d\n", i, g1->n_nodes);
2201
- struct ggml_tensor * t1 = g1->nodes[i];
2202
- struct ggml_tensor * t2 = g2->nodes[i];
2203
-
2204
- assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
2205
-
2206
- struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
2207
- struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
2208
-
2209
- ggml_backend_graph_compute(backend1, &g1v);
2210
- ggml_backend_graph_compute(backend2, &g2v);
2211
-
2212
- if (ggml_is_view_op(t1->op)) {
2213
- continue;
2214
- }
2215
-
2216
- // compare results, calculate rms etc
2217
- if (!callback(i, t1, t2, user_data)) {
2218
- break;
2219
- }
2220
- }
2221
-
2222
- ggml_backend_graph_copy_free(copy);
2223
-
2224
- return true;
2225
- }
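ggml_backend_compare_graph_backend above runs the same graph node by node on two backends and hands each pair of results to a ggml_backend_eval_callback. A minimal sketch of such a callback, assuming F32 outputs small enough to read back whole; the 1e-3 tolerance is arbitrary:

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // compares one node of the two graphs; returning false stops the comparison early
    static bool cb_compare(int i, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) user_data;
        if (t1->type != GGML_TYPE_F32) {
            return true; // only F32 handled in this sketch
        }
        const size_t n = ggml_nelements(t1);
        float * d1 = malloc(n*sizeof(float));
        float * d2 = malloc(n*sizeof(float));
        ggml_backend_tensor_get(t1, d1, 0, n*sizeof(float));
        ggml_backend_tensor_get(t2, d2, 0, n*sizeof(float));
        float max_err = 0.0f;
        for (size_t k = 0; k < n; k++) {
            const float err = fabsf(d1[k] - d2[k]);
            if (err > max_err) {
                max_err = err;
            }
        }
        fprintf(stderr, "node %d (%s): max abs err %g\n", i, t1->name, max_err);
        free(d1);
        free(d2);
        return max_err < 1e-3f; // stop on the first large mismatch
    }

    // usage: ggml_backend_compare_graph_backend(backend_gpu, backend_cpu, graph, cb_compare, NULL);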