@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -8,6 +8,7 @@
 #include <windows.h>
 #endif
 
+#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
@@ -34,6 +35,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -89,7 +95,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name(buffer);
+    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -108,6 +114,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }
 
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +133,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
     }
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    // clear is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return;
+    }
+
+    buffer->iface.clear(buffer, value);
+}
+
 size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
@@ -134,10 +154,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
 bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
 }
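The zero-size handling added in the hunks above can be exercised through the public API. The sketch below is not part of the diff; it is a minimal illustration that assumes the post-refactor headers (ggml-backend.h, plus ggml-cpu.h for ggml_backend_cpu_init) and uses only functions that appear in this file.

```c
#include <assert.h>
#include <stdio.h>

#include "ggml-backend.h"
#include "ggml-cpu.h"   // ggml_backend_cpu_init() is declared here after the CPU backend split

int main(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);

    // a zero-sized request now returns a dummy buffer instead of calling the
    // backend's alloc_buffer implementation
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0);
    assert(buf != NULL);
    assert(ggml_backend_buffer_get_size(buf) == 0);
    assert(ggml_backend_buffer_get_base(buf) == NULL); // get_base is optional for zero-sized buffers

    ggml_backend_buffer_clear(buf, 0); // no-op for zero-sized buffers

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);

    printf("zero-size buffer handling OK\n");
    return 0;
}
```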
@@ -198,7 +214,7 @@ void ggml_backend_free(ggml_backend_t backend) {
 }
 
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->iface.get_default_buffer_type(backend);
+    return ggml_backend_dev_buffer_type(backend->device);
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
@@ -238,43 +254,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
-GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
+    if (size == 0) {
         return;
     }
 
-    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
@@ -316,33 +331,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_supports_op(backend->device, op);
-    }
-
-    return backend->iface.supports_op(backend, op);
+    return ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_supports_buft(backend->device, buft);
-    }
-
-    return backend->iface.supports_buft(backend, buft);
+    return ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    // helper to ease transition to device interface
-    if (backend->device) {
-        return ggml_backend_dev_offload_op(backend->device, op);
-    }
-
-    if (backend->iface.offload_op != NULL) {
-        return backend->iface.offload_op(backend, op);
-    }
-    return false;
+    return ggml_backend_dev_offload_op(backend->device, op);
 }
 
 ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
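With the transition helpers removed above, ggml_backend_supports_op, ggml_backend_supports_buft, and ggml_backend_offload_op are plain wrappers over the device interface. The sketch below is not part of the diff; it is a minimal illustration, under the same header assumptions as the earlier example, that the backend-level and device-level queries now agree for a backend created through the device API.

```c
#include <assert.h>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    // build a tiny op to query support for (metadata only, no data allocated)
    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    // the backend-level queries now simply forward to the backend's device
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    assert(ggml_backend_supports_op(backend, c) == ggml_backend_dev_supports_op(dev, c));

    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
    assert(ggml_backend_supports_buft(backend, buft) == ggml_backend_dev_supports_buft(dev, buft));

    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}
```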
@@ -379,7 +376,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
     } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
@@ -409,832 +406,123 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
409
406
  ggml_backend_tensor_copy(src, dst);
410
407
  }
411
408
 
412
- // events
413
-
414
- ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
415
- // null device is allowed for the transition period to the device interface
416
- if (device == NULL || device->iface.event_new == NULL) {
417
- return NULL;
418
- }
419
- return device->iface.event_new(device);
420
- }
421
-
422
- void ggml_backend_event_free(ggml_backend_event_t event) {
423
- if (event == NULL) {
424
- return;
425
- }
426
- event->device->iface.event_free(event->device, event);
427
- }
428
-
429
- void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
430
- GGML_ASSERT(backend->iface.event_record != NULL);
431
-
432
- backend->iface.event_record(backend, event);
433
- }
434
-
435
- void ggml_backend_event_synchronize(ggml_backend_event_t event) {
436
- GGML_ASSERT(event->device->iface.event_synchronize);
437
-
438
- event->device->iface.event_synchronize(event->device, event);
439
- }
440
-
441
- void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
442
- GGML_ASSERT(backend->iface.event_wait != NULL);
443
-
444
- backend->iface.event_wait(backend, event);
445
- }
446
-
447
- // Backend device
448
-
449
- const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
450
- return device->iface.get_name(device);
451
- }
452
-
453
- const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
454
- return device->iface.get_description(device);
455
- }
456
-
457
- void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
458
- device->iface.get_memory(device, free, total);
459
- }
460
-
461
- enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
462
- return device->iface.get_type(device);
463
- }
464
-
465
- void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
466
- device->iface.get_props(device, props);
467
- }
468
-
469
- ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
470
- return device->reg;
471
- }
472
-
473
- ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
474
- return device->iface.init_backend(device, params);
475
- }
476
-
477
- ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
478
- return device->iface.get_buffer_type(device);
479
- }
480
-
481
- ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
482
- return device->iface.get_host_buffer_type(device);
483
- }
484
-
485
- ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
486
- return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
487
- }
488
-
489
- bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
490
- return device->iface.supports_op(device, op);
491
- }
492
-
493
- bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
494
- return device->iface.supports_buft(device, buft);
495
- }
496
-
497
- bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
498
- return device->iface.offload_op(device, op);
499
- }
500
-
501
- // Backend (reg)
502
-
503
- const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
504
- return reg->iface.get_name(reg);
505
- }
506
-
507
- size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
508
- return reg->iface.get_device_count(reg);
509
- }
510
-
511
- ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
512
- return reg->iface.get_device(reg, index);
513
- }
514
-
515
- void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
516
- if (!reg->iface.get_proc_address) {
517
- return NULL;
518
- }
519
- return reg->iface.get_proc_address(reg, name);
520
- }
521
-
522
- // Backend registry
523
-
524
- #ifdef GGML_USE_CUDA
525
- #include "ggml-cuda.h"
526
- #endif
527
-
528
- struct ggml_backend_registry {
529
- std::vector<ggml_backend_reg_t> backends;
530
- std::vector<ggml_backend_dev_t> devices;
531
-
532
- ggml_backend_registry() {
533
- #ifdef GGML_USE_CUDA
534
- register_backend(ggml_backend_cuda_reg());
535
- #endif
536
-
537
- register_backend(ggml_backend_cpu_reg());
538
-
539
- // TODO: sycl, metal, vulkan, kompute, cann
540
- }
541
-
542
- void register_backend(ggml_backend_reg_t reg) {
543
- #ifndef NDEBUG
544
- fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
545
- __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
546
- #endif
547
- backends.push_back(reg);
548
- for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
549
- register_device(ggml_backend_reg_dev_get(reg, i));
550
- }
551
- }
552
-
553
- void register_device(ggml_backend_dev_t device) {
554
- #ifndef NDEBUG
555
- fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
556
- #endif
557
- devices.push_back(device);
558
- }
559
- };
560
-
561
- static ggml_backend_registry & get_reg() {
562
- static ggml_backend_registry reg;
563
- return reg;
564
- }
565
-
566
- // Internal API
567
- void ggml_backend_register(ggml_backend_reg_t reg) {
568
- get_reg().register_backend(reg);
569
- }
570
-
571
- void ggml_backend_device_register(ggml_backend_dev_t device) {
572
- get_reg().register_device(device);
573
- }
574
-
575
- // Backend (reg) enumeration
576
- size_t ggml_backend_reg_count() {
577
- return get_reg().backends.size();
578
- }
579
-
580
- ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
581
- GGML_ASSERT(index < ggml_backend_reg_count());
582
- return get_reg().backends[index];
583
- }
584
-
585
- ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
586
- for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
587
- ggml_backend_reg_t reg = ggml_backend_reg_get(i);
588
- if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
589
- return reg;
590
- }
591
- }
592
- return NULL;
593
- }
594
-
595
- // Device enumeration
596
- size_t ggml_backend_dev_count() {
597
- return get_reg().devices.size();
598
- }
599
-
600
- ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
601
- GGML_ASSERT(index < ggml_backend_dev_count());
602
- return get_reg().devices[index];
603
- }
604
-
605
- ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
606
- for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
607
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
608
- if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
609
- return dev;
610
- }
611
- }
612
- return NULL;
613
- }
614
-
615
- ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
616
- for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
617
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
618
- if (ggml_backend_dev_type(dev) == type) {
619
- return dev;
620
- }
621
- }
622
- return NULL;
623
- }
624
-
625
- // Convenience functions
626
- ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
627
- ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
628
- if (!dev) {
629
- return NULL;
630
- }
631
- return ggml_backend_dev_init(dev, params);
632
- }
633
-
634
- ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
635
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
636
- if (!dev) {
637
- return NULL;
638
- }
639
- return ggml_backend_dev_init(dev, params);
640
- }
641
-
642
- ggml_backend_t ggml_backend_init_best(void) {
643
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
644
- if (!dev) {
645
- dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
646
- }
647
- if (!dev) {
648
- return NULL;
649
- }
650
- return ggml_backend_dev_init(dev, NULL);
651
- }
652
-
653
- // backend CPU
654
-
655
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
656
-
657
- static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
658
- return "CPU";
659
-
660
- GGML_UNUSED(buffer);
661
- }
662
-
663
- static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
664
- uintptr_t data = (uintptr_t)buffer->context;
665
-
666
- // align the buffer
667
- if (data % TENSOR_ALIGNMENT != 0) {
668
- data = GGML_PAD(data, TENSOR_ALIGNMENT);
669
- }
670
-
671
- return (void *)data;
672
- }
673
-
674
- static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
675
- free(buffer->context);
676
- }
677
-
678
- static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
679
- memset((char *)tensor->data + offset, value, size);
680
-
681
- GGML_UNUSED(buffer);
682
- }
683
-
684
- static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
685
- memcpy((char *)tensor->data + offset, data, size);
686
-
687
- GGML_UNUSED(buffer);
688
- }
689
-
690
- static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
691
- memcpy(data, (const char *)tensor->data + offset, size);
692
-
693
- GGML_UNUSED(buffer);
694
- }
695
-
696
- static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
697
- if (ggml_backend_buffer_is_host(src->buffer)) {
698
- memcpy(dst->data, src->data, ggml_nbytes(src));
699
- return true;
700
- }
701
- return false;
702
-
703
- GGML_UNUSED(buffer);
704
- }
705
-
706
- static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
707
- memset(buffer->context, value, buffer->size);
708
- }
709
-
710
- static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
711
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
712
- /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
713
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
714
- /* .init_tensor = */ NULL, // no initialization required
715
- /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
716
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
717
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
718
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
719
- /* .clear = */ ggml_backend_cpu_buffer_clear,
720
- /* .reset = */ NULL,
721
- };
722
-
723
- static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
724
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
725
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
726
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
727
- /* .init_tensor = */ NULL, // no initialization required
728
- /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
729
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
730
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
731
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
732
- /* .clear = */ ggml_backend_cpu_buffer_clear,
733
- /* .reset = */ NULL,
734
- };
735
-
736
- static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
737
- return "CPU";
738
-
739
- GGML_UNUSED(buft);
740
- }
741
-
742
- static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
743
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
744
- void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
745
- if (data == NULL) {
746
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
747
- return NULL;
748
- }
749
-
750
- return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
751
- }
752
-
753
- static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
754
- return TENSOR_ALIGNMENT;
755
-
756
- GGML_UNUSED(buft);
757
- }
758
-
759
- static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
760
- return true;
761
-
762
- GGML_UNUSED(buft);
763
- }
764
-
765
- ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
766
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
767
- /* .iface = */ {
768
- /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
769
- /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
770
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
771
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
772
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
773
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
774
- },
775
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
776
- /* .context = */ NULL,
777
- };
778
-
779
- return &ggml_backend_cpu_buffer_type;
780
- }
781
-
782
- #ifdef GGML_USE_CPU_HBM
783
-
784
- // buffer type HBM
785
-
786
- #include <hbwmalloc.h>
787
-
788
- static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
789
- return "CPU_HBM";
790
-
791
- GGML_UNUSED(buft);
792
- }
793
-
794
- static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
795
- return "CPU_HBM";
796
-
797
- GGML_UNUSED(buf);
798
- }
799
-
800
- static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
801
- hbw_free(buffer->context);
802
- }
803
-
804
- static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
805
- //void * ptr = hbw_malloc(size);
806
- void * ptr;
807
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
808
- if (result != 0) {
809
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
810
- return NULL;
811
- }
812
-
813
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
814
- buffer->buft = buft;
815
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
816
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
817
-
818
- return buffer;
819
- }
820
-
821
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
822
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
823
- /* .iface = */ {
824
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
825
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
826
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
827
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
828
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
829
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
830
- },
831
- /* .context = */ NULL,
832
- };
833
-
834
- return &ggml_backend_cpu_buffer_type_hbm;
835
- }
836
- #endif
837
-
838
- struct ggml_backend_cpu_context {
839
- int n_threads;
840
- ggml_threadpool_t threadpool;
841
-
842
- uint8_t * work_data;
843
- size_t work_size;
844
-
845
- ggml_abort_callback abort_callback;
846
- void * abort_callback_data;
847
- };
848
-
849
- static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
850
- return "CPU";
851
-
852
- GGML_UNUSED(backend);
853
- }
854
-
855
- static void ggml_backend_cpu_free(ggml_backend_t backend) {
856
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
857
- delete[] cpu_ctx->work_data;
858
- delete cpu_ctx;
859
- delete backend;
860
- }
861
-
862
- static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
863
- return ggml_backend_cpu_buffer_type();
864
-
865
- GGML_UNUSED(backend);
866
- }
867
-
868
- struct ggml_backend_plan_cpu {
869
- struct ggml_cplan cplan;
870
- struct ggml_cgraph cgraph;
871
- };
872
-
873
- static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
874
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
875
-
876
- struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
877
-
878
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
879
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
880
-
881
- if (cpu_plan->cplan.work_size > 0) {
882
- cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
883
- if (cpu_plan->cplan.work_data == NULL) {
884
- delete cpu_plan;
885
- return NULL;
886
- }
887
- }
888
-
889
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
890
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
891
-
892
- return cpu_plan;
893
- }
894
-
895
- static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
896
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
897
-
898
- delete[] cpu_plan->cplan.work_data;
899
- delete cpu_plan;
900
-
901
- GGML_UNUSED(backend);
902
- }
903
-
904
- static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
905
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
906
-
907
- return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
908
-
909
- GGML_UNUSED(backend);
910
- }
911
-
912
- static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
913
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
914
-
915
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
916
-
917
- if (cpu_ctx->work_size < cplan.work_size) {
918
- delete[] cpu_ctx->work_data;
919
- cpu_ctx->work_data = new uint8_t[cplan.work_size];
920
- if (cpu_ctx->work_data == NULL) {
921
- cpu_ctx->work_size = 0;
922
- return GGML_STATUS_ALLOC_FAILED;
923
- }
924
- cpu_ctx->work_size = cplan.work_size;
925
- }
926
- cplan.work_data = (uint8_t *)cpu_ctx->work_data;
927
-
928
- cplan.abort_callback = cpu_ctx->abort_callback;
929
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
930
-
931
- return ggml_graph_compute(cgraph, &cplan);
932
- }
933
-
934
- static const struct ggml_backend_i ggml_backend_cpu_i = {
935
- /* .get_name = */ ggml_backend_cpu_get_name,
936
- /* .free = */ ggml_backend_cpu_free,
937
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
938
- /* .set_tensor_async = */ NULL,
939
- /* .get_tensor_async = */ NULL,
940
- /* .cpy_tensor_async = */ NULL,
941
- /* .synchronize = */ NULL,
942
- /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
943
- /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
944
- /* .graph_plan_update = */ NULL,
945
- /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
946
- /* .graph_compute = */ ggml_backend_cpu_graph_compute,
947
- /* .supports_op = */ NULL,
948
- /* .supports_buft = */ NULL,
949
- /* .offload_op = */ NULL,
950
- /* .event_record = */ NULL,
951
- /* .event_wait = */ NULL,
952
- };
953
-
954
- static ggml_guid_t ggml_backend_cpu_guid(void) {
955
- static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
956
- return &guid;
957
- }
958
-
959
- ggml_backend_t ggml_backend_cpu_init(void) {
960
- struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
961
- if (ctx == NULL) {
962
- return NULL;
963
- }
964
-
965
- ctx->n_threads = GGML_DEFAULT_N_THREADS;
966
- ctx->threadpool = NULL;
967
- ctx->work_data = NULL;
968
- ctx->work_size = 0;
969
- ctx->abort_callback = NULL;
970
- ctx->abort_callback_data = NULL;
971
-
972
- ggml_backend_t cpu_backend = new ggml_backend {
973
- /* .guid = */ ggml_backend_cpu_guid(),
974
- /* .interface = */ ggml_backend_cpu_i,
975
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
976
- /* .context = */ ctx,
977
- };
978
-
979
- if (cpu_backend == NULL) {
980
- delete ctx;
981
- return NULL;
982
- }
983
-
984
- return cpu_backend;
985
- }
986
-
987
- bool ggml_backend_is_cpu(ggml_backend_t backend) {
988
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
989
- }
990
-
991
- void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
992
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
993
-
994
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
995
- ctx->n_threads = n_threads;
996
- }
997
-
998
- void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
999
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
1000
-
1001
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
1002
-
1003
- if (ctx->threadpool && ctx->threadpool != threadpool) {
1004
- // already had a different threadpool, pause/suspend it before switching
1005
- ggml_threadpool_pause(ctx->threadpool);
1006
- }
1007
- ctx->threadpool = threadpool;
1008
- }
1009
-
1010
- void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
1011
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
1012
-
1013
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
1014
- ctx->abort_callback = abort_callback;
1015
- ctx->abort_callback_data = abort_callback_data;
1016
- }
1017
-
1018
- ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1019
- GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
1020
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
1021
- }
1022
-
1023
- ////////////////////////
409
+ // events
1024
410
 
1025
- struct ggml_backend_cpu_device_context {
1026
- std::string description = "CPU";
411
+ ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
412
+ // null device is allowed for the transition period to the device interface
413
+ if (device == NULL || device->iface.event_new == NULL) {
414
+ return NULL;
415
+ }
416
+ return device->iface.event_new(device);
417
+ }
1027
418
 
1028
- ggml_backend_cpu_device_context() {
1029
- #ifdef __APPLE__
1030
- size_t len = 0;
1031
- if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1032
- description.resize(len);
1033
- sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1034
- }
1035
- #elif defined(__linux__)
1036
- FILE * f = fopen("/proc/cpuinfo", "r");
1037
- if (f) {
1038
- char buf[1024];
1039
- while (fgets(buf, sizeof(buf), f)) {
1040
- if (strncmp(buf, "model name", 10) == 0) {
1041
- char * p = strchr(buf, ':');
1042
- if (p) {
1043
- p++;
1044
- while (std::isspace(*p)) {
1045
- p++;
1046
- }
1047
- while (std::isspace(p[strlen(p) - 1])) {
1048
- p[strlen(p) - 1] = '\0';
1049
- }
1050
- description = p;
1051
- break;
1052
- }
1053
- }
1054
- }
1055
- fclose(f);
1056
- }
1057
- #elif defined(_WIN32)
1058
- HKEY hKey;
1059
- if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1060
- TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1061
- 0,
1062
- KEY_READ,
1063
- &hKey) == ERROR_SUCCESS) {
1064
- DWORD cpu_brand_size = 0;
1065
- if (RegQueryValueExA(hKey,
1066
- TEXT("ProcessorNameString"),
1067
- NULL,
1068
- NULL,
1069
- NULL,
1070
- &cpu_brand_size) == ERROR_SUCCESS) {
1071
- description.resize(cpu_brand_size);
1072
- if (RegQueryValueExA(hKey,
1073
- TEXT("ProcessorNameString"),
1074
- NULL,
1075
- NULL,
1076
- (LPBYTE)&description[0], // NOLINT
1077
- &cpu_brand_size) == ERROR_SUCCESS) {
1078
- if (description.find('\0') != std::string::npos) {
1079
- description.resize(description.find('\0'));
1080
- }
1081
- }
1082
- }
1083
- RegCloseKey(hKey);
1084
- }
1085
- #endif
419
+ void ggml_backend_event_free(ggml_backend_event_t event) {
420
+ if (event == NULL) {
421
+ return;
1086
422
  }
1087
- };
423
+ event->device->iface.event_free(event->device, event);
424
+ }
1088
425
 
1089
- static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
1090
- return "CPU";
426
+ void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
427
+ GGML_ASSERT(backend->iface.event_record != NULL);
1091
428
 
1092
- GGML_UNUSED(dev);
429
+ backend->iface.event_record(backend, event);
1093
430
  }
1094
431
 
1095
- static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
1096
- struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
432
+ void ggml_backend_event_synchronize(ggml_backend_event_t event) {
433
+ GGML_ASSERT(event->device->iface.event_synchronize);
1097
434
 
1098
- return ctx->description.c_str();
435
+ event->device->iface.event_synchronize(event->device, event);
1099
436
  }
1100
437
 
1101
- static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
1102
- // TODO
1103
- *free = 0;
1104
- *total = 0;
438
+ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
439
+ GGML_ASSERT(backend->iface.event_wait != NULL);
1105
440
 
1106
- GGML_UNUSED(dev);
441
+ backend->iface.event_wait(backend, event);
1107
442
  }
1108
443
 
1109
- static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
1110
- return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
444
+ // Backend device
1111
445
 
1112
- GGML_UNUSED(dev);
446
+ const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
447
+ return device->iface.get_name(device);
1113
448
  }
1114
449
 
1115
- static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
1116
- props->name = ggml_backend_cpu_device_get_name(dev);
1117
- props->description = ggml_backend_cpu_device_get_description(dev);
1118
- props->type = ggml_backend_cpu_device_get_type(dev);
1119
- ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1120
- props->caps = {
1121
- /* async */ false,
1122
- /* host_buffer */ false,
1123
- /* events */ false,
1124
- };
450
+ const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
451
+ return device->iface.get_description(device);
1125
452
  }
1126
453
 
1127
- static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
1128
- return ggml_backend_cpu_init();
454
+ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
455
+ device->iface.get_memory(device, free, total);
456
+ }
1129
457
 
1130
- GGML_UNUSED(dev);
1131
- GGML_UNUSED(params);
458
+ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
459
+ return device->iface.get_type(device);
1132
460
  }
1133
461
 
1134
- static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
1135
- return ggml_backend_cpu_buffer_type();
462
+ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
463
+ memset(props, 0, sizeof(*props));
464
+ device->iface.get_props(device, props);
465
+ }
1136
466
 
1137
- GGML_UNUSED(dev);
467
+ ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
468
+ return device->reg;
1138
469
  }
1139
470
 
1140
- static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1141
- return ggml_backend_cpu_buffer_from_ptr(ptr, size);
471
+ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
472
+ return device->iface.init_backend(device, params);
473
+ }
1142
474
 
1143
- GGML_UNUSED(dev);
1144
- GGML_UNUSED(max_tensor_size);
475
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
476
+ return device->iface.get_buffer_type(device);
1145
477
  }
1146
478
 
1147
- static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
1148
- switch (op->op) {
1149
- case GGML_OP_CPY:
1150
- return
1151
- op->type != GGML_TYPE_IQ2_XXS &&
1152
- op->type != GGML_TYPE_IQ2_XS &&
1153
- op->type != GGML_TYPE_IQ1_S &&
1154
- op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
1155
- case GGML_OP_MUL_MAT:
1156
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
1157
- case GGML_OP_ROPE_BACK:
1158
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
1159
- case GGML_OP_IM2COL_BACK:
1160
- return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
1161
- case GGML_OP_OUT_PROD:
1162
- return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
1163
- default:
1164
- return true;
479
+ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
480
+ if (device->iface.get_host_buffer_type == NULL) {
481
+ return NULL;
1165
482
  }
1166
483
 
1167
- GGML_UNUSED(dev);
484
+ return device->iface.get_host_buffer_type(device);
1168
485
  }
1169
486
 
1170
- static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
1171
- return ggml_backend_buft_is_host(buft);
1172
-
1173
- GGML_UNUSED(dev);
487
+ ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
488
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
1174
489
  }
1175
490
 
1176
- static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
1177
- /* .get_name = */ ggml_backend_cpu_device_get_name,
1178
- /* .get_description = */ ggml_backend_cpu_device_get_description,
1179
- /* .get_memory = */ ggml_backend_cpu_device_get_memory,
1180
- /* .get_type = */ ggml_backend_cpu_device_get_type,
1181
- /* .get_props = */ ggml_backend_cpu_device_get_props,
1182
- /* .init_backend = */ ggml_backend_cpu_device_init,
1183
- /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
1184
- /* .get_host_buffer_type = */ NULL,
1185
- /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
1186
- /* .supports_op = */ ggml_backend_cpu_device_supports_op,
1187
- /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
1188
- /* .offload_op = */ NULL,
1189
- /* .event_new = */ NULL,
1190
- /* .event_free = */ NULL,
1191
- /* .event_synchronize = */ NULL,
1192
- };
1193
-
1194
- ////////////////////////
1195
-
1196
- static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
1197
- return "CPU";
1198
-
1199
- GGML_UNUSED(reg);
491
+ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
492
+ return device->iface.supports_op(device, op);
1200
493
  }
1201
494
 
1202
- static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
1203
- return 1;
1204
-
1205
- GGML_UNUSED(reg);
495
+ bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
496
+ return device->iface.supports_buft(device, buft);
1206
497
  }
1207
498
 
1208
- static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
1209
- GGML_ASSERT(index == 0);
499
+ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
500
+ if (device->iface.offload_op != NULL) {
501
+ return device->iface.offload_op(device, op);
502
+ }
1210
503
 
1211
- static ggml_backend_cpu_device_context ctx;
1212
- static ggml_backend_device ggml_backend_cpu_device = {
1213
- /* .iface = */ ggml_backend_cpu_device_i,
1214
- /* .reg = */ reg,
1215
- /* .context = */ &ctx,
1216
- };
504
+ return false;
505
+ }
1217
506
 
1218
- return &ggml_backend_cpu_device;
507
+ // Backend (reg)
1219
508
 
1220
- GGML_UNUSED(reg);
1221
- GGML_UNUSED(index);
509
+ const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
510
+ return reg->iface.get_name(reg);
1222
511
  }
1223
512
 
1224
- static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
1225
- /* .get_name = */ ggml_backend_cpu_reg_get_name,
1226
- /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
1227
- /* .get_device = */ ggml_backend_cpu_reg_get_device,
1228
- /* .get_proc_address = */ NULL,
1229
- };
513
+ size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
514
+ return reg->iface.get_device_count(reg);
515
+ }
1230
516
 
1231
- ggml_backend_reg_t ggml_backend_cpu_reg(void) {
1232
- static struct ggml_backend_reg ggml_backend_cpu_reg = {
1233
- /* .iface = */ ggml_backend_cpu_reg_i,
1234
- /* .context = */ NULL,
1235
- };
517
+ ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
518
+ return reg->iface.get_device(reg, index);
519
+ }
1236
520
 
1237
- return &ggml_backend_cpu_reg;
521
+ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
522
+ if (!reg->iface.get_proc_address) {
523
+ return NULL;
524
+ }
525
+ return reg->iface.get_proc_address(reg, name);
1238
526
  }
1239
527
 
1240
528
  // multi-buffer buffer
@@ -1244,12 +532,6 @@ struct ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };
 
-static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
 static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1268,7 +550,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
-    /* .get_name = */ ggml_backend_multi_buffer_get_name,
     /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
     /* .get_base = */ NULL,
     /* .init_tensor = */ NULL,
@@ -1297,7 +578,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
 }
 
 void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
@@ -1389,7 +670,7 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
-    bool debug;
+    int debug;
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1408,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }
 
 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1422,7 +703,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     }
 
 #ifndef NDEBUG
-    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
         __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
 #endif
 
@@ -1441,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML
1441
722
 
1442
723
  // returns the backend that should be used for the node based on the current locations
1443
724
  static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1444
- // TODO: use supports_op to check if the backend supports the op
1445
-
1446
725
  // assign pre-allocated nodes to their backend
1447
726
  int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1448
727
  if (cur_backend_id != -1) {
@@ -1461,7 +740,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1461
740
 
1462
741
  if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1463
742
  // since the tensor is pre-allocated, it cannot be moved to another backend
1464
- GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
743
+ GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
1465
744
  }
1466
745
 
1467
746
  // graph input
@@ -1477,7 +756,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1477
756
  if (src == NULL) {
1478
757
  continue;
1479
758
  }
1480
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
759
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
760
+ // not an ideal solution
761
+ if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1481
762
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1482
763
  // check if a backend with higher prio wants to offload the op
1483
764
  if (src_backend_id == sched->n_backends - 1) {
@@ -1511,32 +792,34 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
  ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
  sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+ GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
  fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
  }
- fprintf(stderr, "\n");
+ GGML_LOG_DEBUG("\n");
  cur_split++;
  }
  struct ggml_tensor * node = graph->nodes[i];
  if (ggml_is_view_op(node->op)) {
  continue;
  }
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
+ if (sched->debug > 1) {
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+ fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
  }
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+ GGML_LOG_DEBUG("\n");
  }
- fprintf(stderr, "\n");
  }
  }
 
@@ -1601,6 +884,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  int * node_backend_id = &tensor_backend_id(node);
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
  // do not overwrite user assignments
  if (*node_backend_id == -1) {
  *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1828,11 +1114,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  if (src == NULL) {
  continue;
  }
- // check if a weight is on a different backend
+ // check if a weight is on a different and incompatible backend
  // by starting a new split, the memory of the previously offloaded weights can be reused
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != cur_backend_id) {
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
  need_new_split = true;
  break;
  }
@@ -1844,7 +1130,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  int src_backend_id = sched->hv_tensor_backend_ids[id];
  bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
  if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
  need_new_split = true;
  break;
  }
@@ -2050,11 +1335,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  // the re-allocation may cause the split inputs to be moved to a different address
  ggml_backend_sched_synchronize(sched);
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+ GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
  #endif
  ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
  if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+ GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
  return false;
  }
  }
@@ -2165,11 +1450,12 @@ ggml_backend_sched_t ggml_backend_sched_new(
  bool parallel) {
  GGML_ASSERT(n_backends > 0);
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
- GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+ GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
 
  struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
- sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+ const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+ sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
  sched->n_backends = n_backends;
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
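Note: `GGML_SCHED_DEBUG` is now parsed as an integer level instead of a presence flag; per the `ggml_backend_sched_print_assignments` hunk earlier, a value greater than 1 additionally prints per-node assignments. A small sketch of enabling it programmatically before constructing the scheduler (exporting the variable in the shell has the same effect); `setenv()` is POSIX and used here only for illustration:

```cpp
#include <stdlib.h>

int main(void) {
    // level 1: print split assignments, level > 1: also dump per-node assignments
    setenv("GGML_SCHED_DEBUG", "2", /*overwrite=*/ 1);
    // ... create backends and call ggml_backend_sched_new(...) as usual ...
    return 0;
}
```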
 
@@ -2197,6 +1483,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
  sched->backends[b] = backends[b];
  sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
  GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
  if (sched->n_copies > 1) {
  for (int c = 0; c < sched->n_copies; c++) {
  sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
@@ -2252,12 +1539,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
  ggml_backend_sched_split_graph(sched, measure_graph);
 
+ ggml_backend_sched_synchronize(sched);
+
  if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
  return false;
  }
 
  ggml_backend_sched_reset(sched);
- ggml_backend_sched_synchronize(sched);
 
  return true;
  }
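Note: `ggml_backend_sched_reserve()` now synchronizes pending backend work before reserving, rather than after resetting. A minimal caller-side sketch, assuming the worst-case `measure_graph` is built elsewhere and the header name below:

```cpp
#include "ggml-backend.h"   // header name assumed
#include <stdbool.h>

// Sketch: reserve worst-case buffers once so later graphs of the same shape
// do not trigger re-allocation; the synchronize step is handled internally.
static bool warm_up_sched(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    return ggml_backend_sched_reserve(sched, measure_graph);
}
```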
@@ -2448,7 +1736,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  struct ggml_context * ctx_unallocated = ggml_init(params);
 
  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
- fprintf(stderr, "failed to allocate context for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
  ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
@@ -2471,7 +1759,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  // allocate nodes
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
  if (buffer == NULL) {
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
  ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
@@ -2558,3 +1846,154 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
  return true;
  }
+
+ // CPU backend - buffer
+
+ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+ uintptr_t data = (uintptr_t)buffer->context;
+
+ // align the buffer
+ if (data % TENSOR_ALIGNMENT != 0) {
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
+ }
+
+ return (void *)data;
+ }
+
+ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_aligned_free(buffer->context, buffer->size);
+ }
+
+ static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ memset((char *)tensor->data + offset, value, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ memcpy((char *)tensor->data + offset, data, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ if (ggml_backend_buffer_is_host(src->buffer)) {
+ memcpy(dst->data, src->data, ggml_nbytes(src));
+ return true;
+ }
+ return false;
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+ /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+ /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ // CPU backend buffer type
+
+ // this buffer type is defined here to make it available to all backends
+
+ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ void * data = ggml_aligned_malloc(size);
+
+ if (data == NULL) {
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+ return NULL;
+ }
+
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+ }
+
+ static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ return TENSOR_ALIGNMENT;
+
+ GGML_UNUSED(buft);
+ }
+
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU_Mapped";
+
+ GGML_UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+ }
+
+ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+ }
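Note: the block above moves the CPU buffer and buffer-type implementations into this file so every backend can use them. A minimal sketch of wrapping caller-owned memory with `ggml_backend_cpu_buffer_from_ptr()` follows; the 64-byte alignment, buffer size, header name, and `aligned_alloc()` usage are assumptions chosen to satisfy the `TENSOR_ALIGNMENT` assertion, and `ggml_backend_buffer_free()` is assumed to skip the NULL `free_buffer` hook of the "CPU_Mapped" type so the caller keeps ownership of the memory.

```cpp
#include "ggml-backend.h"   // header name assumed
#include <stdlib.h>

int main(void) {
    const size_t size = 4096;                  // multiple of the chosen alignment
    void * mem = aligned_alloc(64, size);      // caller-owned, over-aligned storage
    if (mem == NULL) {
        return 1;
    }

    // The pointer is neither copied nor freed by the buffer (see from_ptr iface above).
    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(mem, size);

    // ... place tensors into `buf` with the usual allocation helpers ...

    ggml_backend_buffer_free(buf);             // releases the wrapper only
    free(mem);                                 // caller frees the underlying memory
    return 0;
}
```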