@fugood/llama.node 0.0.1-alpha.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
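
The bulk of the package is the vendored llama.cpp tree; the hunk below is the new file package/src/llama.cpp/ggml-backend.c. For orientation only, here is a minimal sketch (not part of the package) written against the functions that file defines, such as ggml_backend_cpu_init and ggml_backend_alloc_buffer; the 4-thread and 16 MiB values are arbitrary examples.

// Sketch only: exercises the CPU backend API implemented in ggml-backend.c below.
// Assumes the vendored ggml-backend.h is on the include path.
#include "ggml-backend.h"
#include <stdio.h>

int main(void) {
    // initialize the CPU backend and configure its thread count
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize CPU backend\n");
        return 1;
    }
    ggml_backend_cpu_set_n_threads(backend, 4);

    // allocate a buffer from the backend's default buffer type (16 MiB, arbitrary)
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 16*1024*1024);

    printf("backend: %s, buffer: %s, size: %zu, host: %d\n",
           ggml_backend_name(backend),
           ggml_backend_buffer_name(buffer),
           ggml_backend_buffer_get_size(buffer),
           ggml_backend_buffer_is_host(buffer));

    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
    return 0;
}
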
--- /dev/null
+++ b/package/src/llama.cpp/ggml-backend.c
@@ -0,0 +1,2099 @@
+#include "ggml-backend-impl.h"
+#include "ggml-alloc.h"
+#include "ggml-impl.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// backend buffer type
+
+const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name(buft);
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    return buft->iface.alloc_buffer(buft, size);
+}
+
+size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_alignment(buft);
+}
+
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
+GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+    // get_alloc_size is optional, defaults to ggml_nbytes
+    if (buft->iface.get_alloc_size) {
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
+    }
+    return ggml_nbytes(tensor);
+}
+
+bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return buft->iface.supports_backend(buft, backend);
+}
+
+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
+// backend buffer
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
+        ggml_backend_buffer_type_t buft,
+        struct ggml_backend_buffer_i iface,
+        ggml_backend_buffer_context_t context,
+        size_t size) {
+    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+
+    (*buffer) = (struct ggml_backend_buffer) {
+        /* .interface = */ iface,
+        /* .buft = */ buft,
+        /* .context = */ context,
+        /* .size = */ size,
+        /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
+    };
+
+    return buffer;
+}
+
+const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name(buffer);
+}
+
+void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer == NULL) {
+        return;
+    }
+
+    if (buffer->iface.free_buffer != NULL) {
+        buffer->iface.free_buffer(buffer);
+    }
+    free(buffer);
+}
+
+size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
+    return buffer->size;
+}
+
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    void * base = buffer->iface.get_base(buffer);
+
+    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
+
+    return base;
+}
+
+GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // init_tensor is optional
+    if (buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    }
+}
+
+size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
+}
+
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
+}
+
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
+}
+
+void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
+}
+
+ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
+    return buffer->buft;
+}
+
+void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
+    if (buffer->iface.reset) {
+        buffer->iface.reset(buffer);
+    }
+}
+
+bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
+    if (dst_buf->iface.cpy_tensor) {
+        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+    }
+    return false;
+}
+
+// backend
+
+ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return NULL;
+    }
+    return backend->guid;
+}
+
+const char * ggml_backend_name(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return "NULL";
+    }
+    return backend->iface.get_name(backend);
+}
+
+void ggml_backend_free(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return;
+    }
+
+    backend->iface.free(backend);
+}
+
+ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
+    return backend->iface.get_default_buffer_type(backend);
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
+    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
+}
+
+size_t ggml_backend_get_alignment(ggml_backend_t backend) {
+    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
+}
+
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
+void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
+    if (backend->iface.set_tensor_async == NULL) {
+        ggml_backend_tensor_set(tensor, data, offset, size);
+    } else {
+        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    }
+}
+
+void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+
+    if (backend->iface.get_tensor_async == NULL) {
+        ggml_backend_tensor_get(tensor, data, offset, size);
+    } else {
+        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    }
+}
+
+GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
+    if (!size) {
+        return;
+    }
+
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
+}
+
+GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+
+    if (!size) {
+        return;
+    }
+
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
+}
+
+void ggml_backend_synchronize(ggml_backend_t backend) {
+    if (backend->iface.synchronize == NULL) {
+        return;
+    }
+
+    backend->iface.synchronize(backend);
+}
+
+ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
+    return backend->iface.graph_plan_create(backend, cgraph);
+}
+
+void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
+    backend->iface.graph_plan_free(backend, plan);
+}
+
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
+    return backend->iface.graph_plan_compute(backend, plan);
+}
+
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
+}
+
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_compute(backend, cgraph);
+}
+
+bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return backend->iface.supports_op(backend, op);
+}
+
+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
+// backend copy
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    if (src == dst) {
+        return;
+    }
+
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
+    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
+        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+#endif
+        size_t nbytes = ggml_nbytes(src);
+        void * data = malloc(nbytes);
+        ggml_backend_tensor_get(src, data, 0, nbytes);
+        ggml_backend_tensor_set(dst, data, 0, nbytes);
+        free(data);
+    }
+}
+
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    if (src == dst) {
+        return;
+    }
+
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
+        }
+    }
+
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // sync src, set_async dst
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+    } else {
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_copy(src, dst);
+        ggml_backend_synchronize(backend_dst);
+    }
+}
+
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    if (backend->iface.event_new == NULL) {
+        return NULL;
+    }
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
+    }
+    event->backend->iface.event_free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_record != NULL);
+
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+
+    event->backend->iface.event_synchronize(event);
+}
+
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}
+
+// backend registry
+
+#define GGML_REG_MAX_BACKENDS 16
+
+struct ggml_backend_reg {
+    char name[128];
+    ggml_backend_init_fn init_fn;
+    ggml_backend_buffer_type_t default_buffer_type;
+    void * user_data;
+};
+
+static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
+static size_t ggml_backend_registry_count = 0;
+
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+
+GGML_CALL static void ggml_backend_registry_init(void) {
+    static bool initialized = false;
+
+    if (initialized) {
+        return;
+    }
+
+    initialized = true;
+
+    ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
+
+    // add forward decls here to avoid including the backend headers
+#ifdef GGML_USE_CUDA
+    extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
+    ggml_backend_cuda_reg_devices();
+#endif
+
+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
+#ifdef GGML_USE_METAL
+    extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+    extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+    ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
+#endif
+
+#ifdef GGML_USE_VULKAN
+    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+    ggml_backend_vk_reg_devices();
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+    ggml_backend_kompute_reg_devices();
+#endif
+}
+
+GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+    GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
+
+    size_t id = ggml_backend_registry_count;
+
+    ggml_backend_registry[id] = (struct ggml_backend_reg) {
+        /* .name = */ {0},
+        /* .fn = */ init_fn,
+        /* .default_buffer_type = */ default_buffer_type,
+        /* .user_data = */ user_data,
+    };
+
+    snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
+
+#ifndef NDEBUG
+    fprintf(stderr, "%s: registered backend %s\n", __func__, name);
+#endif
+
+    ggml_backend_registry_count++;
+}
+
+size_t ggml_backend_reg_get_count(void) {
+    ggml_backend_registry_init();
+
+    return ggml_backend_registry_count;
+}
+
+size_t ggml_backend_reg_find_by_name(const char * name) {
+    ggml_backend_registry_init();
+
+    for (size_t i = 0; i < ggml_backend_registry_count; i++) {
+        // TODO: case insensitive in a portable way
+        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
+            return i;
+        }
+    }
+
+    // not found
+    return SIZE_MAX;
+}
+
+// init from backend:params string
+ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
+    ggml_backend_registry_init();
+
+    const char * params = strchr(backend_str, ':');
+    char backend_name[128];
+    if (params == NULL) {
+        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
+        params = "";
+    } else {
+        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
+        params++;
+    }
+
+    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+
+    if (backend_i == SIZE_MAX) {
+        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
+        return NULL;
+    }
+
+    return ggml_backend_reg_init_backend(backend_i, params);
+}
+
+const char * ggml_backend_reg_get_name(size_t i) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_registry[i].name;
+}
+
+ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
+}
+
+ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_registry[i].default_buffer_type;
+}
+
+ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
+    ggml_backend_registry_init();
+
+    GGML_ASSERT(i < ggml_backend_registry_count);
+    return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
+}
+
+// backend CPU
+
+static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
+
+GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CPU";
+
+    GGML_UNUSED(buffer);
+}
+
+GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
+}
+
+GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+}
+
+GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+    /* .get_name = */ ggml_backend_cpu_buffer_name,
+    /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+// for buffers from ptr, free is not called
+static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+    /* .get_name = */ ggml_backend_cpu_buffer_name,
+    /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
+    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
+}
+
+GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cpu(backend);
+
+    GGML_UNUSED(buft);
+}
+
+GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buf);
+}
+
+GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
+struct ggml_backend_cpu_context {
+    int n_threads;
+    void * work_data;
+    size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void * abort_callback_data;
+};
+
+GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+    return "CPU";
+
+    GGML_UNUSED(backend);
+}
+
+GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    free(cpu_ctx->work_data);
+    free(cpu_ctx);
+    free(backend);
+}
+
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(backend);
+}
+
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
+    }
+
+    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return cpu_plan;
+}
+
+GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    free(cpu_plan->cplan.work_data);
+    free(cpu_plan);
+
+    GGML_UNUSED(backend);
+}
+
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+    GGML_UNUSED(backend);
+}
+
+GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        free(cpu_ctx->work_data);
+        cpu_ctx->work_data = malloc(cplan.work_size);
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        cpu_ctx->work_size = cplan.work_size;
+    }
+    cplan.work_data = cpu_ctx->work_data;
+
+    cplan.abort_callback = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return ggml_graph_compute(cgraph, &cplan);
+}
+
+GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS &&
+                op->type != GGML_TYPE_IQ1_S &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
+
+    GGML_UNUSED(backend);
+}
+
+static struct ggml_backend_i cpu_backend_i = {
+    /* .get_name = */ ggml_backend_cpu_name,
+    /* .free = */ ggml_backend_cpu_free,
+    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor_async = */ NULL,
+    /* .cpy_tensor_async = */ NULL,
+    /* .synchronize = */ NULL,
+    /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute = */ ggml_backend_cpu_graph_compute,
+    /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .offload_op = */ NULL,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_cpu_guid(void) {
+    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+    if (ctx == NULL) {
+        return NULL;
+    }
+
+    ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->work_data = NULL;
+    ctx->work_size = 0;
+    ctx->abort_callback = NULL;
+    ctx->abort_callback_data = NULL;
+
+    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+    if (cpu_backend == NULL) {
+        free(ctx);
+        return NULL;
+    }
+
+    *cpu_backend = (struct ggml_backend) {
+        /* .guid = */ ggml_backend_cpu_guid(),
+        /* .interface = */ cpu_backend_i,
+        /* .context = */ ctx
+    };
+    return cpu_backend;
+}
+
+GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
+}
+
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
+}
+
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+    return ggml_backend_cpu_init();
+
+    GGML_UNUSED(params);
+    GGML_UNUSED(user_data);
+}
+
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base = */ NULL,
+        /* .init_tensor = */ NULL,
+        /* .set_tensor = */ NULL,
+        /* .get_tensor = */ NULL,
+        /* .cpy_tensor = */ NULL,
+        /* .clear = */ ggml_backend_multi_buffer_clear,
+        /* .reset = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    GGML_ASSERT(ctx->buffers != NULL);
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
1010
+
1011
+ // scheduler
1012
+
1013
+ #ifndef GGML_SCHED_MAX_BACKENDS
1014
+ #define GGML_SCHED_MAX_BACKENDS 16
1015
+ #endif
1016
+
1017
+ #ifndef GGML_SCHED_MAX_SPLITS
1018
+ #define GGML_SCHED_MAX_SPLITS 2048
1019
+ #endif
1020
+
1021
+ #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1022
+ #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
1023
+ #endif
1024
+
1025
+ #ifndef GGML_SCHED_MAX_COPIES
1026
+ #define GGML_SCHED_MAX_COPIES 4
1027
+ #endif
1028
+
1029
+ struct ggml_backend_sched_split {
1030
+ int backend_id;
1031
+ int i_start;
1032
+ int i_end;
1033
+ struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1034
+ int n_inputs;
1035
+ // graph view of this split
1036
+ struct ggml_cgraph graph;
1037
+ };
1038
+
1039
+ struct ggml_backend_sched {
1040
+ bool is_reset; // true if the scheduler has been reset since the last graph split
1041
+ bool is_alloc;
1042
+
1043
+ int n_backends;
1044
+
1045
+ ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
1046
+ ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
1047
+ ggml_gallocr_t galloc;
1048
+
1049
+ // hash keys of the nodes in the graph
1050
+ struct ggml_hash_set hash_set;
1051
+ // hash values
1052
+ int * tensor_backend_id;
1053
+ struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1054
+
1055
+ int * node_backend_ids; // [graph_size]
1056
+ int * leaf_backend_ids; // [graph_size]
1057
+
1058
+ // copy of the graph with modified inputs
1059
+ struct ggml_cgraph * graph;
1060
+
1061
+ // graph splits
1062
+ struct ggml_backend_sched_split * splits;
1063
+ int n_splits;
1064
+ int splits_capacity;
1065
+
1066
+ // pipeline parallelism support
1067
+ int n_copies;
1068
+ int cur_copy;
1069
+ ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1070
+ struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1071
+ int n_graph_inputs;
1072
+
1073
+ struct ggml_context * ctx;
1074
+
1075
+ ggml_backend_sched_eval_callback callback_eval;
1076
+ void * callback_eval_user_data;
1077
+
1078
+ // align context_buffer to GGML_MEM_ALIGN
1079
+ #ifdef _MSC_VER
1080
+ __declspec(align(GGML_MEM_ALIGN))
1081
+ #else
1082
+ __attribute__((aligned(GGML_MEM_ALIGN)))
1083
+ #endif
1084
+ char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
1085
+ };
1086
+
1087
+ #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
1088
+ #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
1089
+
1090
+ // returns the priority of the backend, lower id is higher priority
1091
+ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
1092
+ for (int i = 0; i < sched->n_backends; i++) {
1093
+ if (sched->backends[i] == backend) {
1094
+ return i;
1095
+ }
1096
+ }
1097
+ return -1;
1098
+ }
1099
+
1100
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
1101
+ ggml_backend_buffer_t buffer = tensor->buffer;
1102
+ if (buffer == NULL) {
1103
+ return -1;
1104
+ }
1105
+
1106
+ // find highest prio backend that supports the buffer type
1107
+ for (int i = 0; i < sched->n_backends; i++) {
1108
+ if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
1109
+ return i;
1110
+ }
1111
+ }
1112
+
1113
+ fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
1114
+ __func__, ggml_backend_buffer_name(buffer), tensor->name);
1115
+ GGML_ASSERT(false);
1116
+
1117
+ return -1;
1118
+ }
1119
+
1120
+ #if 0
1121
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1122
+ #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1123
+ #define GET_CAUSE(node) causes[hash_id(node)]
1124
+ #else
1125
+ #define SET_CAUSE(node, ...)
1126
+ #define GET_CAUSE(node) ""
1127
+ #endif
1128
+
1129
+ // returns the backend that should be used for the node based on the current locations
1130
+ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1131
+ // TODO: use supports_op to check if the backend supports the op
1132
+
1133
+ // assign pre-allocated nodes to their backend
1134
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
1135
+ if (cur_backend_id != -1) {
1136
+ SET_CAUSE(tensor, "1.dst");
1137
+ return cur_backend_id;
1138
+ }
1139
+
1140
+ // view_src
1141
+ if (tensor->view_src != NULL) {
1142
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
1143
+ if (cur_backend_id != -1) {
1144
+ SET_CAUSE(tensor, "1.vsrc");
1145
+ return cur_backend_id;
1146
+ }
1147
+ }
1148
+
1149
+ // graph input
1150
+ if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1151
+ cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1152
+ SET_CAUSE(tensor, "1.inp");
1153
+ return cur_backend_id;
1154
+ }
1155
+
1156
+ // assign nodes that use weights to the backend of the weights
1157
+ // operations with weights are preferably run on the same backend as the weights
1158
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
1159
+ const struct ggml_tensor * src = tensor->src[i];
1160
+ if (src == NULL) {
1161
+ continue;
1162
+ }
1163
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1164
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
1165
+ // check if a backend with higher prio wants to offload the op
1166
+ if (src_backend_id == sched->n_backends - 1) {
1167
+ for (int b = 0; b < src_backend_id; b++) {
1168
+ if (ggml_backend_offload_op(sched->backends[b], tensor)) {
1169
+ SET_CAUSE(tensor, "1.off");
1170
+ return b;
1171
+ }
1172
+ }
1173
+ }
1174
+ SET_CAUSE(tensor, "1.wgt%d", i);
1175
+ return src_backend_id;
1176
+ }
1177
+ }
1178
+
1179
+ return -1;
1180
+ }
1181
+
1182
+ static char * fmt_size(size_t size) {
1183
+ static char buffer[128];
1184
+ if (size >= 1024*1024) {
1185
+ sprintf(buffer, "%zuM", size/1024/1024);
1186
+ } else {
1187
+ sprintf(buffer, "%zuK", size/1024);
1188
+ }
1189
+ return buffer;
1190
+ }
1191
+
1192
+ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+     int cur_split = 0;
+     for (int i = 0; i < graph->n_nodes; i++) {
+         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
+             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
+             fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+                 sched->splits[cur_split].n_inputs);
+             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                 fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
+             }
+             fprintf(stderr, "\n");
+             cur_split++;
+         }
+         struct ggml_tensor * node = graph->nodes[i];
+         if (ggml_is_view_op(node->op)) {
+             continue;
+         }
+         ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+         fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * src = node->src[j];
+             if (src == NULL) {
+                 continue;
+             }
+             ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+             fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+         }
+         fprintf(stderr, "\n");
+     }
+ }
+
+ //#define DEBUG_PASS1
+ //#define DEBUG_PASS2
+ //#define DEBUG_PASS3
+ //#define DEBUG_PASS4
+
+ // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
+ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+     // reset splits
+     sched->n_splits = 0;
+     sched->n_graph_inputs = 0;
+     sched->is_reset = false;
+
+     struct ggml_init_params params = {
+         /* .mem_size = */ sizeof(sched->context_buffer),
+         /* .mem_buffer = */ sched->context_buffer,
+         /* .no_alloc = */ true
+     };
+
+     ggml_free(sched->ctx);
+
+     sched->ctx = ggml_init(params);
+     if (sched->ctx == NULL) {
+         fprintf(stderr, "%s: failed to initialize context\n", __func__);
+         GGML_ASSERT(false);
+     }
+
+     // pass 1: assign backends to ops with pre-allocated inputs
+     for (int i = 0; i < graph->n_leafs; i++) {
+         struct ggml_tensor * leaf = graph->leafs[i];
+         int * leaf_backend_id = &tensor_backend_id(leaf);
+         if (*leaf_backend_id != -1) {
+             // do not overwrite user assignments
+             continue;
+         }
+         *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+     }
+
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         int * node_backend_id = &tensor_backend_id(node);
+         if (*node_backend_id != -1) {
+             // do not overwrite user assignments
+             continue;
+         }
+         *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+         // src
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * src = node->src[j];
+             if (src == NULL) {
+                 continue;
+             }
+             int * src_backend_id = &tensor_backend_id(src);
+             if (*src_backend_id == -1) {
+                 *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+             }
+         }
+     }
+ #ifdef DEBUG_PASS1
+     fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
+ #endif
+
+     // pass 2: expand current backend assignments
+     // assign the same backend to adjacent nodes
+     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
+     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+
+     // pass 2.2 expand gpu down
+     {
+         int cur_backend_id = -1;
+         for (int i = 0; i < graph->n_nodes; i++) {
+             struct ggml_tensor * node = graph->nodes[i];
+             if (ggml_is_view_op(node->op)) {
+                 continue;
+             }
+             int * node_backend_id = &tensor_backend_id(node);
+             if (*node_backend_id != -1) {
+                 if (*node_backend_id == sched->n_backends - 1) {
+                     // skip cpu (lowest prio backend)
+                     cur_backend_id = -1;
+                 } else {
+                     cur_backend_id = *node_backend_id;
+                 }
+             } else {
+                 *node_backend_id = cur_backend_id;
+                 SET_CAUSE(node, "2.2");
+             }
+         }
+     }
+     // pass 2.1 expand gpu up
+     {
+         int cur_backend_id = -1;
+         for (int i = graph->n_nodes - 1; i >= 0; i--) {
+             struct ggml_tensor * node = graph->nodes[i];
+             if (ggml_is_view_op(node->op)) {
+                 continue;
+             }
+             int * node_backend_id = &tensor_backend_id(node);
+             if (*node_backend_id != -1) {
+                 if (*node_backend_id == sched->n_backends - 1) {
+                     // skip cpu (lowest prio backend)
+                     cur_backend_id = -1;
+                 } else {
+                     cur_backend_id = *node_backend_id;
+                 }
+             } else {
+                 *node_backend_id = cur_backend_id;
+                 SET_CAUSE(node, "2.1");
+             }
+         }
+     }
+     // pass 2.4 expand rest down
+     {
+         int cur_backend_id = -1;
+         for (int i = 0; i < graph->n_nodes; i++) {
+             struct ggml_tensor * node = graph->nodes[i];
+             if (ggml_is_view_op(node->op)) {
+                 continue;
+             }
+             int * node_backend_id = &tensor_backend_id(node);
+             if (*node_backend_id != -1) {
+                 cur_backend_id = *node_backend_id;
+             } else {
+                 *node_backend_id = cur_backend_id;
+                 SET_CAUSE(node, "2.4");
+             }
+         }
+     }
+     // pass 2.3 expand rest up
+     {
+         int cur_backend_id = -1;
+         for (int i = graph->n_nodes - 1; i >= 0; i--) {
+             struct ggml_tensor * node = graph->nodes[i];
+             if (ggml_is_view_op(node->op)) {
+                 continue;
+             }
+             int * node_backend_id = &tensor_backend_id(node);
+             if (*node_backend_id != -1) {
+                 cur_backend_id = *node_backend_id;
+             } else {
+                 *node_backend_id = cur_backend_id;
+                 SET_CAUSE(node, "2.3");
+             }
+         }
+     }
+
+ #ifdef DEBUG_PASS2
+     fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
+ #endif
+
+     // pass 3: assign backends to remaining src from dst and view_src
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         int * cur_backend_id = &tensor_backend_id(node);
+         if (node->view_src != NULL && *cur_backend_id == -1) {
+             *cur_backend_id = tensor_backend_id(node->view_src);
+             SET_CAUSE(node, "3.vsrc");
+         }
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * src = node->src[j];
+             if (src == NULL) {
+                 continue;
+             }
+             int * src_backend_id = &tensor_backend_id(src);
+             if (*src_backend_id == -1) {
+                 if (src->view_src != NULL) {
+                     // views are always on the same backend as the source
+                     *src_backend_id = tensor_backend_id(src->view_src);
+                     SET_CAUSE(src, "3.vsrc");
+                 } else {
+                     *src_backend_id = *cur_backend_id;
+                     SET_CAUSE(src, "3.cur");
+                 }
+             }
+         }
+     }
+ #ifdef DEBUG_PASS3
+     fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
+ #endif
+
+     // pass 4: split graph, find tensors that need to be copied
+     {
+         int i_split = 0;
+         struct ggml_backend_sched_split * split = &sched->splits[0];
+         // find the backend of the first split, skipping view ops
+         for (int i = 0; i < graph->n_nodes; i++) {
+             struct ggml_tensor * node = graph->nodes[i];
+             if (!ggml_is_view_op(node->op)) {
+                 split->backend_id = tensor_backend_id(node);
+                 break;
+             }
+         }
+         split->i_start = 0;
+         split->n_inputs = 0;
+         memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+         int cur_backend_id = split->backend_id;
+         for (int i = 0; i < graph->n_nodes; i++) {
+             struct ggml_tensor * node = graph->nodes[i];
+
+             if (ggml_is_view_op(node->op)) {
+                 continue;
+             }
+
+             const int node_backend_id = tensor_backend_id(node);
+
+             GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
+
+             // check if we should start a new split based on the sources of the current node
+             bool need_new_split = false;
+             if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     struct ggml_tensor * src = node->src[j];
+                     if (src == NULL) {
+                         continue;
+                     }
+                     // check if a weight is on a different backend
+                     // by starting a new split, the memory of the previously offloaded weights can be reused
+                     if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                         int src_backend_id = tensor_backend_id(src);
+                         if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                             need_new_split = true;
+                             break;
+                         }
+                     }
+                     // check if the split has too many inputs
+                     if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                         const size_t id = hash_id(src);
+                         int src_backend_id = sched->tensor_backend_id[id];
+                         if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                             //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                             need_new_split = true;
+                             break;
+                         }
+                     }
+                 }
+             }
+
+             if (node_backend_id != cur_backend_id || need_new_split) {
+                 split->i_end = i;
+                 i_split++;
+                 if (i_split >= sched->splits_capacity) {
+                     sched->splits_capacity *= 2;
+                     sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                     GGML_ASSERT(sched->splits != NULL);
+                 }
+                 GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                 split = &sched->splits[i_split];
+                 split->backend_id = node_backend_id;
+                 split->i_start = i;
+                 split->n_inputs = 0;
+                 cur_backend_id = node_backend_id;
+             }
+
+             // find inputs that are not on the same backend
+             for (int j = 0; j < GGML_MAX_SRC; j++) {
+                 struct ggml_tensor * src = node->src[j];
+                 if (src == NULL) {
+                     continue;
+                 }
+
+                 const int src_backend_id = tensor_backend_id(src);
+                 assert(src_backend_id != -1); // all inputs should be assigned by now
+
+                 if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
+                     size_t id = hash_id(src);
+                     if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                         ggml_backend_t backend = sched->backends[src_backend_id];
+                         for (int c = 0; c < sched->n_copies; c++) {
+                             struct ggml_tensor * tensor_copy;
+                             if (c == sched->cur_copy) {
+                                 tensor_copy = src; // use the original tensor as the current copy
+                             } else {
+                                 tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                                 ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                             }
+                             if (sched->n_copies > 1) {
+                                 ggml_set_input(tensor_copy);
+                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                             }
+                             sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                             SET_CAUSE(tensor_copy, "4.cpy");
+                         }
+                         int n_graph_inputs = sched->n_graph_inputs++;
+                         GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                         sched->graph_inputs[n_graph_inputs] = src;
+                     }
+                 }
+
+                 if (src_backend_id != node_backend_id) {
+                     // create a copy of the input in the split's backend
+                     const size_t id = hash_id(src);
+                     if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
+                         ggml_backend_t backend = sched->backends[cur_backend_id];
+                         for (int c = 0; c < sched->n_copies; c++) {
+                             struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                             ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                             if (sched->n_copies > 1) {
+                                 ggml_set_input(tensor_copy);
+                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                             }
+                             sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                             SET_CAUSE(tensor_copy, "4.cpy");
+                         }
+                         int n_inputs = split->n_inputs++;
+                         GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                         split->inputs[n_inputs] = src;
+                     }
+                     node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
+                 }
+             }
+         }
+         split->i_end = graph->n_nodes;
+         sched->n_splits = i_split + 1;
+     }
+ #ifdef DEBUG_PASS4
+     fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
+ #endif
+
+     // create copies of the graph for each split
+     // TODO: avoid this copy
+     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
+     for (int i = 0; i < sched->n_splits; i++) {
+         struct ggml_backend_sched_split * split = &sched->splits[i];
+         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
+
+         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
+         for (int j = 0; j < split->n_inputs; j++) {
+             assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
+             struct ggml_tensor * input = split->inputs[j];
+             const size_t input_id = hash_id(input);
+             struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
+
+             // add a dependency to the input source so that it is not freed before the copy is done
+             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+             input_dep->src[0] = input;
+             sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
+             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
+
+             // add a dependency to the input copy so that it is allocated at the start of the split
+             sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
+             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
+         }
+
+         for (int j = split->i_start; j < split->i_end; j++) {
+             assert(graph_copy->size > graph_copy->n_nodes);
+             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
+             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
+         }
+     }
+
+     if (sched->n_copies > 1) {
+         // add input copies as leafs so that they are allocated first
+         for (int i = 0; i < sched->n_graph_inputs; i++) {
+             struct ggml_tensor * input = sched->graph_inputs[i];
+             size_t id = hash_id(input);
+             int backend_id = tensor_backend_id(input);
+             for (int c = 0; c < sched->n_copies; c++) {
+                 struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+             }
+         }
+
+         for (int i = 0; i < sched->n_splits; i++) {
+             struct ggml_backend_sched_split * split = &sched->splits[i];
+             int backend_id = split->backend_id;
+             for (int j = 0; j < split->n_inputs; j++) {
+                 struct ggml_tensor * input = split->inputs[j];
+                 size_t id = hash_id(input);
+                 for (int c = 0; c < sched->n_copies; c++) {
+                     struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                     sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                     graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                 }
+             }
+         }
+     }
+
+     // add leafs from the original graph
+     for (int i = 0; i < graph->n_leafs; i++) {
+         struct ggml_tensor * leaf = graph->leafs[i];
+         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+     }
+
+     sched->graph = graph_copy;
+ }
+
+ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+     // allocate graph
+     if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+         // the re-allocation may cause the split inputs to be moved to a different address
+         ggml_backend_sched_synchronize(sched);
+ #ifndef NDEBUG
+         fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
+ #endif
+         ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+         if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+             fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+     struct ggml_backend_sched_split * splits = sched->splits;
+
+     for (int i = 0; i < sched->n_splits; i++) {
+         struct ggml_backend_sched_split * split = &splits[i];
+         int split_backend_id = split->backend_id;
+         ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+         // copy the input tensors to the split backend
+         for (int j = 0; j < split->n_inputs; j++) {
+             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
+             struct ggml_tensor * input = split->inputs[j];
+             struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
+
+             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                     ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                 } else {
+                     ggml_backend_synchronize(split_backend);
+                 }
+                 ggml_backend_tensor_copy(input, input_cpy);
+             } else {
+                 // wait for the split backend to finish using the input before overwriting it
+                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                 } else {
+                     ggml_backend_synchronize(split_backend);
+                 }
+                 ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+             }
+         }
+
+         if (!sched->callback_eval) {
+             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+             if (ec != GGML_STATUS_SUCCESS) {
+                 return ec;
+             }
+         } else {
+             // similar to ggml_backend_compare_graph_backend
+             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                 struct ggml_tensor * t = split->graph.nodes[j0];
+
+                 // check if the user needs data from this node
+                 bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                 int j1 = j0;
+
+                 // determine the range [j0, j1] of nodes that can be computed together
+                 while (!need && j1 < split->graph.n_nodes - 1) {
+                     t = split->graph.nodes[++j1];
+                     need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                 }
+
+                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                 enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+                 if (ec != GGML_STATUS_SUCCESS) {
+                     return ec;
+                 }
+
+                 // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                 ggml_backend_synchronize(split_backend);
+
+                 if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                     break;
+                 }
+
+                 j0 = j1;
+             }
+         }
+
+         // record the event of this copy
+         if (split->n_inputs > 0) {
+             if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                 ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+             }
+         }
+     }
+
+     sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
+
+     return GGML_STATUS_SUCCESS;
+ }
+
+ ggml_backend_sched_t ggml_backend_sched_new(
+         ggml_backend_t * backends,
+         ggml_backend_buffer_type_t * bufts,
+         int n_backends,
+         size_t graph_size,
+         bool parallel) {
+     GGML_ASSERT(n_backends > 0);
+     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+
+     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
+
+     // initialize hash table
+     sched->hash_set = ggml_hash_set_new(graph_size);
+     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+     sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
+
+     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+
+     sched->n_backends = n_backends;
+
+     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+     const int initial_splits_capacity = 16;
+     sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+     sched->splits_capacity = initial_splits_capacity;
+
+     for (int b = 0; b < n_backends; b++) {
+         sched->backends[b] = backends[b];
+         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+         GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+         if (sched->n_copies > 1) {
+             for (int c = 0; c < sched->n_copies; c++) {
+                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
+             }
+         }
+     }
+
+     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+
+     ggml_backend_sched_reset(sched);
+
+     return sched;
+ }
+
+ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+     if (sched == NULL) {
+         return;
+     }
+     for (int b = 0; b < sched->n_backends; b++) {
+         for (int c = 0; c < sched->n_copies; c++) {
+             ggml_backend_event_free(sched->events[b][c]);
+         }
+     }
+     ggml_gallocr_free(sched->galloc);
+     ggml_free(sched->ctx);
+     free(sched->splits);
+     free(sched->hash_set.keys);
+     free(sched->tensor_backend_id);
+     free(sched->tensor_copies);
+     free(sched->node_backend_ids);
+     free(sched->leaf_backend_ids);
+     free(sched);
+ }
+
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+     // reset state for the next run
+     size_t hash_size = sched->hash_set.size;
+     memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+     memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+     memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+
+     sched->is_reset = true;
+     sched->is_alloc = false;
+ }
+
+ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
+     ggml_backend_sched_split_graph(sched, measure_graph);
+
+     // TODO: extract this to a separate function
+     if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+         return false;
+     }
+
+     ggml_backend_sched_reset(sched);
+     ggml_backend_sched_synchronize(sched);
+
+     return true;
+ }
+
+ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
+
+     ggml_backend_sched_split_graph(sched, graph);
+
+     if (!ggml_backend_sched_alloc_splits(sched)) {
+         return false;
+     }
+
+     sched->is_alloc = true;
+
+     return true;
+ }
+
+ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+     enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+     ggml_backend_sched_synchronize(sched);
+     return err;
+ }
+
+ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+     if (!sched->is_reset && !sched->is_alloc) {
+         ggml_backend_sched_reset(sched);
+     }
+
+     if (!sched->is_alloc) {
+         if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+             return GGML_STATUS_ALLOC_FAILED;
+         }
+     }
+
+     return ggml_backend_sched_compute_splits(sched);
+ }
+
+ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+     for (int i = 0; i < sched->n_backends; i++) {
+         ggml_backend_synchronize(sched->backends[i]);
+     }
+ }
+
+ void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+     sched->callback_eval = callback;
+     sched->callback_eval_user_data = user_data;
+ }
+
+ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+     return sched->n_splits;
+ }
+
+ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+     return sched->n_copies;
+ }
+
+ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+     int backend_index = ggml_backend_sched_backend_id(sched, backend);
+     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
+ }
+
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+     int backend_index = ggml_backend_sched_backend_id(sched, backend);
+     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+     tensor_backend_id(node) = backend_index;
+ }
+
+ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+     int backend_index = tensor_backend_id(node);
+     if (backend_index == -1) {
+         return NULL;
+     }
+     return sched->backends[backend_index];
+ }
+
+ // utils
+
+ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+     GGML_ASSERT(tensor->buffer == NULL);
+     GGML_ASSERT(tensor->view_src != NULL);
+     GGML_ASSERT(tensor->view_src->buffer != NULL);
+     GGML_ASSERT(tensor->view_src->data != NULL);
+
+     tensor->buffer = buffer;
+     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
+     tensor->backend = tensor->view_src->backend;
+     ggml_backend_buffer_init_tensor(buffer, tensor);
+ }
+
+ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
+     GGML_ASSERT(tensor->buffer == NULL);
+     GGML_ASSERT(tensor->data == NULL);
+     GGML_ASSERT(tensor->view_src == NULL);
+     GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
+     GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
+                 (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
+
+     tensor->buffer = buffer;
+     tensor->data = addr;
+     ggml_backend_buffer_init_tensor(buffer, tensor);
+ }
+
+ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+         struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
+
+     GGML_ASSERT(src != NULL);
+     GGML_ASSERT(src->data && "graph must be allocated");
+
+     size_t id = ggml_hash_insert(hash_set, src);
+     if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
+         return node_copies[ggml_hash_find(hash_set, src)];
+     }
+
+     struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
+     if (src->view_src != NULL) {
+         dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+         dst->view_offs = src->view_offs;
+     }
+     dst->op = src->op;
+     memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
+     ggml_set_name(dst, src->name);
+
+     // copy src
+     for (int i = 0; i < GGML_MAX_SRC; i++) {
+         struct ggml_tensor * s = src->src[i];
+         if (s == NULL) {
+             continue;
+         }
+         dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+     }
+
+     node_copies[id] = dst;
+     return dst;
+ }
+
+ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+     size_t id = ggml_hash_find(hash_set, src);
+     if (node_init[id]) {
+         return;
+     }
+     node_init[id] = true;
+
+     struct ggml_tensor * dst = node_copies[id];
+     if (dst->view_src != NULL) {
+         graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
+         ggml_backend_view_init(dst->view_src->buffer, dst);
+     }
+     else {
+         ggml_backend_tensor_copy(src, dst);
+     }
+
+     // init src
+     for (int i = 0; i < GGML_MAX_SRC; i++) {
+         struct ggml_tensor * s = src->src[i];
+         if (s == NULL) {
+             continue;
+         }
+         graph_copy_init_tensor(hash_set, node_copies, node_init, s);
+     }
+ }
+
+ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
+     struct ggml_hash_set hash_set = {
+         /* .size = */ graph->visited_hash_table.size,
+         /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
+     };
+     struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+     bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
+
+     struct ggml_init_params params = {
+         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
+         /* .mem_buffer = */ NULL,
+         /* .no_alloc = */ true
+     };
+
+     struct ggml_context * ctx_allocated = ggml_init(params);
+     struct ggml_context * ctx_unallocated = ggml_init(params);
+
+     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+         fprintf(stderr, "failed to allocate context for graph copy\n");
+         free(hash_set.keys);
+         free(node_copies);
+         free(node_init);
+         ggml_free(ctx_allocated);
+         ggml_free(ctx_unallocated);
+         return (struct ggml_backend_graph_copy) {
+             /* .buffer = */ NULL,
+             /* .ctx_allocated = */ NULL,
+             /* .ctx_unallocated = */ NULL,
+             /* .graph = */ NULL,
+         };
+     }
+
+     // dup nodes
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+     }
+
+     // allocate nodes
+     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+     if (buffer == NULL) {
+         fprintf(stderr, "failed to allocate buffer for graph copy\n");
+         free(hash_set.keys);
+         free(node_copies);
+         free(node_init);
+         ggml_free(ctx_allocated);
+         ggml_free(ctx_unallocated);
+         return (struct ggml_backend_graph_copy) {
+             /* .buffer = */ NULL,
+             /* .ctx_allocated = */ NULL,
+             /* .ctx_unallocated = */ NULL,
+             /* .graph = */ NULL,
+         };
+     }
+
+     //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
+
+     // copy data and init views
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         graph_copy_init_tensor(hash_set, node_copies, node_init, node);
+     }
+
+     // build graph copy
+     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
+         graph_copy->nodes[i] = node_copy;
+     }
+     graph_copy->n_nodes = graph->n_nodes;
+
+     free(hash_set.keys);
+     free(node_copies);
+     free(node_init);
+
+     return (struct ggml_backend_graph_copy) {
+         /* .buffer = */ buffer,
+         /* .ctx_allocated = */ ctx_allocated,
+         /* .ctx_unallocated = */ ctx_unallocated,
+         /* .graph = */ graph_copy,
+     };
+ }
+
+ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
+     ggml_backend_buffer_free(copy.buffer);
+     ggml_free(copy.ctx_allocated);
+     ggml_free(copy.ctx_unallocated);
+ }
+
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+     struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+     if (copy.buffer == NULL) {
+         return false;
+     }
+
+     struct ggml_cgraph * g1 = graph;
+     struct ggml_cgraph * g2 = copy.graph;
+
+     assert(g1->n_nodes == g2->n_nodes);
+
+     for (int i = 0; i < g1->n_nodes; i++) {
+         //printf("eval %d/%d\n", i, g1->n_nodes);
+         struct ggml_tensor * t1 = g1->nodes[i];
+         struct ggml_tensor * t2 = g2->nodes[i];
+
+         assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
+
+         struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
+         struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
+
+         ggml_backend_graph_compute(backend1, &g1v);
+         ggml_backend_graph_compute(backend2, &g2v);
+
+         if (ggml_is_view_op(t1->op)) {
+             continue;
+         }
+
+         // compare results, calculate rms etc
+         if (!callback(i, t1, t2, user_data)) {
+             break;
+         }
+     }
+
+     ggml_backend_graph_copy_free(copy);
+
+     return true;
+ }