@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
@@ -1,3 +1,13 @@
+ // Note: porting this file to C++ is a work in progress
+
+ #ifdef _WIN32
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #    define NOMINMAX
+ #endif
+ #include <windows.h>
+ #endif
+
  #include "ggml-backend-impl.h"
  #include "ggml-alloc.h"
  #include "ggml-impl.h"
@@ -8,9 +18,14 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
+ #include <string>
+ #include <vector>

+ #ifdef __APPLE__
+ #include <sys/types.h>
+ #include <sys/sysctl.h>
+ #endif

- #define MAX(a, b) ((a) > (b) ? (a) : (b))

  // backend buffer type

@@ -18,7 +33,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
  return buft->iface.get_name(buft);
  }

- GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  return buft->iface.alloc_buffer(buft, size);
  }

@@ -34,7 +49,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
  return SIZE_MAX;
  }

- GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
  // get_alloc_size is optional, defaults to ggml_nbytes
  if (buft->iface.get_alloc_size) {
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -51,16 +66,18 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
  return false;
  }

- // backend buffer
+ ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
+ return buft->device;
+ }

- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
- ggml_backend_buffer_type_t buft,
- struct ggml_backend_buffer_i iface,
- ggml_backend_buffer_context_t context,
- size_t size) {
- ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+ // backend buffer

- (*buffer) = (struct ggml_backend_buffer) {
+ ggml_backend_buffer_t ggml_backend_buffer_init(
+ ggml_backend_buffer_type_t buft,
+ struct ggml_backend_buffer_i iface,
+ void * context,
+ size_t size) {
+ ggml_backend_buffer_t buffer = new ggml_backend_buffer {
  /* .interface = */ iface,
  /* .buft = */ buft,
  /* .context = */ context,
@@ -83,7 +100,7 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
  if (buffer->iface.free_buffer != NULL) {
  buffer->iface.free_buffer(buffer);
  }
- free(buffer);
+ delete buffer;
  }

  size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -98,14 +115,14 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
  return base;
  }

- GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  // init_tensor is optional
  if (buffer->iface.init_tensor) {
  buffer->iface.init_tensor(buffer, tensor);
  }
  }

- size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
+ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
  return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
  }

@@ -218,7 +235,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
  }
  }

- GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

  GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -232,7 +249,7 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
  buf->iface.set_tensor(buf, tensor, data, offset, size);
  }

- GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

  GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -246,6 +263,22 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
  buf->iface.get_tensor(buf, tensor, data, offset, size);
  }

+ GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
+ if (!size) {
+ return;
+ }
+
+ GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+
+ buf->iface.memset_tensor(buf, tensor, value, offset, size);
+ }
+
  void ggml_backend_synchronize(ggml_backend_t backend) {
  if (backend->iface.synchronize == NULL) {
  return;
@@ -283,20 +316,39 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
  }

  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ // helper to ease transition to device interface
+ if (backend->device) {
+ return ggml_backend_dev_supports_op(backend->device, op);
+ }
+
  return backend->iface.supports_op(backend, op);
  }

  bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ // helper to ease transition to device interface
+ if (backend->device) {
+ return ggml_backend_dev_supports_buft(backend->device, buft);
+ }
+
  return backend->iface.supports_buft(backend, buft);
  }

  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ // helper to ease transition to device interface
+ if (backend->device) {
+ return ggml_backend_dev_offload_op(backend->device, op);
+ }
+
  if (backend->iface.offload_op != NULL) {
  return backend->iface.offload_op(backend, op);
  }
  return false;
  }

+ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
+ return backend->device;
+ }
+
  // backend copy

  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -351,43 +403,39 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
  }

  // an async copy would normally happen after all the queued operations on both backends are completed
- // sync src, set_async dst
- if (ggml_backend_buffer_is_host(src->buffer)) {
- ggml_backend_synchronize(backend_src);
- ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
- } else {
- ggml_backend_synchronize(backend_src);
- ggml_backend_tensor_copy(src, dst);
- ggml_backend_synchronize(backend_dst);
- }
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+ ggml_backend_synchronize(backend_src);
+ ggml_backend_synchronize(backend_dst);
+ ggml_backend_tensor_copy(src, dst);
  }

  // events

- ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
- if (backend->iface.event_new == NULL) {
+ ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
+ // null device is allowed for the transition period to the device interface
+ if (device == NULL || device->iface.event_new == NULL) {
  return NULL;
  }
- return backend->iface.event_new(backend);
+ return device->iface.event_new(device);
  }

  void ggml_backend_event_free(ggml_backend_event_t event) {
  if (event == NULL) {
  return;
  }
- event->backend->iface.event_free(event);
+ event->device->iface.event_free(event->device, event);
  }

- void ggml_backend_event_record(ggml_backend_event_t event) {
- GGML_ASSERT(event->backend->iface.event_record != NULL);
+ void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
+ GGML_ASSERT(backend->iface.event_record != NULL);

- event->backend->iface.event_record(event);
+ backend->iface.event_record(backend, event);
  }

  void ggml_backend_event_synchronize(ggml_backend_event_t event) {
- GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+ GGML_ASSERT(event->device->iface.event_synchronize);

- event->backend->iface.event_synchronize(event);
+ event->device->iface.event_synchronize(event->device, event);
  }

  void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
@@ -396,170 +444,223 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
  backend->iface.event_wait(backend, event);
  }

- // backend registry
+ // Backend device

- #define GGML_REG_MAX_BACKENDS 64
+ const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
+ return device->iface.get_name(device);
+ }

- struct ggml_backend_reg {
- char name[128];
- ggml_backend_init_fn init_fn;
- ggml_backend_buffer_type_t default_buffer_type;
- void * user_data;
- };
+ const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
+ return device->iface.get_description(device);
+ }

- static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
- static size_t ggml_backend_registry_count = 0;
+ void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+ device->iface.get_memory(device, free, total);
+ }

- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
+ return device->iface.get_type(device);
+ }

- GGML_CALL static void ggml_backend_registry_init(void) {
- static bool initialized = false;
+ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+ device->iface.get_props(device, props);
+ }

- if (initialized) {
- return;
- }
+ ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
+ return device->reg;
+ }

- initialized = true;
+ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
+ return device->iface.init_backend(device, params);
+ }

- ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
+ ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
+ return device->iface.get_buffer_type(device);
+ }

- // add forward decls here to avoid including the backend headers
- #ifdef GGML_USE_CUDA
- extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
- ggml_backend_cuda_reg_devices();
- #endif
+ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
+ return device->iface.get_host_buffer_type(device);
+ }

- #ifdef GGML_USE_SYCL
- extern void ggml_backend_sycl_reg_devices(void);
- ggml_backend_sycl_reg_devices();
- #endif
+ ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
+ }

- #ifdef GGML_USE_METAL
- extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
- extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
- ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
- #endif
+ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+ return device->iface.supports_op(device, op);
+ }

- #ifdef GGML_USE_VULKAN
- extern GGML_CALL int ggml_backend_vk_reg_devices(void);
- ggml_backend_vk_reg_devices();
- #endif
+ bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
+ return device->iface.supports_buft(device, buft);
+ }

- #ifdef GGML_USE_KOMPUTE
- extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
- ggml_backend_kompute_reg_devices();
- #endif
+ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+ return device->iface.offload_op(device, op);
+ }

- #ifdef GGML_USE_CANN
- extern GGML_CALL int ggml_backend_cann_reg_devices(void);
- ggml_backend_cann_reg_devices();
- #endif
+ // Backend (reg)
+
+ const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
+ return reg->iface.get_name(reg);
  }

- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
- GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
+ size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
+ return reg->iface.get_device_count(reg);
+ }

- size_t id = ggml_backend_registry_count;
+ ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
+ return reg->iface.get_device(reg, index);
+ }

- ggml_backend_registry[id] = (struct ggml_backend_reg) {
- /* .name = */ {0},
- /* .fn = */ init_fn,
- /* .default_buffer_type = */ default_buffer_type,
- /* .user_data = */ user_data,
- };
+ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+ if (!reg->iface.get_proc_address) {
+ return NULL;
+ }
+ return reg->iface.get_proc_address(reg, name);
+ }

- snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
+ // Backend registry

- #ifndef NDEBUG
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
+ #ifdef GGML_USE_CUDA
+ #include "ggml-cuda.h"
  #endif

- ggml_backend_registry_count++;
- }
+ struct ggml_backend_registry {
+ std::vector<ggml_backend_reg_t> backends;
+ std::vector<ggml_backend_dev_t> devices;

- size_t ggml_backend_reg_get_count(void) {
- ggml_backend_registry_init();
+ ggml_backend_registry() {
+ #ifdef GGML_USE_CUDA
+ register_backend(ggml_backend_cuda_reg());
+ #endif

- return ggml_backend_registry_count;
- }
+ register_backend(ggml_backend_cpu_reg());

- size_t ggml_backend_reg_find_by_name(const char * name) {
- ggml_backend_registry_init();
+ // TODO: sycl, metal, vulkan, kompute, cann
+ }

- for (size_t i = 0; i < ggml_backend_registry_count; i++) {
- // TODO: case insensitive in a portable way
- if (strcmp(ggml_backend_registry[i].name, name) == 0) {
- return i;
+ void register_backend(ggml_backend_reg_t reg) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
+ __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+ #endif
+ backends.push_back(reg);
+ for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
+ register_device(ggml_backend_reg_dev_get(reg, i));
  }
  }

- // not found
- return SIZE_MAX;
- }
+ void register_device(ggml_backend_dev_t device) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+ #endif
+ devices.push_back(device);
+ }
+ };

- // init from backend:params string
- ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
- ggml_backend_registry_init();
+ static ggml_backend_registry & get_reg() {
+ static ggml_backend_registry reg;
+ return reg;
+ }

- const char * params = strchr(backend_str, ':');
- char backend_name[128];
- if (params == NULL) {
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
- params = "";
- } else {
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
- params++;
- }
+ // Internal API
+ void ggml_backend_register(ggml_backend_reg_t reg) {
+ get_reg().register_backend(reg);
+ }

- size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+ void ggml_backend_device_register(ggml_backend_dev_t device) {
+ get_reg().register_device(device);
+ }

- if (backend_i == SIZE_MAX) {
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
- return NULL;
- }
+ // Backend (reg) enumeration
+ size_t ggml_backend_reg_count() {
+ return get_reg().backends.size();
+ }

- return ggml_backend_reg_init_backend(backend_i, params);
+ ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
+ GGML_ASSERT(index < ggml_backend_reg_count());
+ return get_reg().backends[index];
  }

- const char * ggml_backend_reg_get_name(size_t i) {
- ggml_backend_registry_init();
+ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+ ggml_backend_reg_t reg = ggml_backend_reg_get(i);
+ if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
+ return reg;
+ }
+ }
+ return NULL;
+ }

- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_registry[i].name;
+ // Device enumeration
+ size_t ggml_backend_dev_count() {
+ return get_reg().devices.size();
  }

- ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
- ggml_backend_registry_init();
+ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
+ GGML_ASSERT(index < ggml_backend_dev_count());
+ return get_reg().devices[index];
+ }

- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
+ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
+ return dev;
+ }
+ }
+ return NULL;
  }

- ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
- ggml_backend_registry_init();
+ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == type) {
+ return dev;
+ }
+ }
+ return NULL;
+ }

- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_registry[i].default_buffer_type;
+ // Convenience functions
+ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
+ ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
+ if (!dev) {
+ return NULL;
+ }
+ return ggml_backend_dev_init(dev, params);
  }

- ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
- ggml_backend_registry_init();
+ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
+ if (!dev) {
+ return NULL;
+ }
+ return ggml_backend_dev_init(dev, params);
+ }

- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
+ ggml_backend_t ggml_backend_init_best(void) {
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
+ if (!dev) {
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
+ }
+ if (!dev) {
+ return NULL;
+ }
+ return ggml_backend_dev_init(dev, NULL);
  }

  // backend CPU

  static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment

- GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+ static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
  return "CPU";

  GGML_UNUSED(buffer);
  }

- GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
  uintptr_t data = (uintptr_t)buffer->context;

  // align the buffer
@@ -570,23 +671,29 @@ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t b
  return (void *)data;
  }

- GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  free(buffer->context);
  }

- GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+ memset((char *)tensor->data + offset, value, size);
+
+ GGML_UNUSED(buffer);
+ }
+
+ static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  memcpy((char *)tensor->data + offset, data, size);

  GGML_UNUSED(buffer);
  }

- GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  memcpy(data, (const char *)tensor->data + offset, size);

  GGML_UNUSED(buffer);
  }

- GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
  if (ggml_backend_buffer_is_host(src->buffer)) {
  memcpy(dst->data, src->data, ggml_nbytes(src));
  return true;
@@ -596,15 +703,16 @@ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t b
  GGML_UNUSED(buffer);
  }

- GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
  memset(buffer->context, value, buffer->size);
  }

- static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
- /* .get_name = */ ggml_backend_cpu_buffer_name,
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+ /* .get_name = */ ggml_backend_cpu_buffer_get_name,
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
  /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
@@ -612,12 +720,12 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .reset = */ NULL,
  };

- // for buffers from ptr, free is not called
- static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
- /* .get_name = */ ggml_backend_cpu_buffer_name,
+ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+ /* .get_name = */ ggml_backend_cpu_buffer_get_name,
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
  /* .init_tensor = */ NULL, // no initialization required
+ /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
@@ -625,13 +733,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
  /* .reset = */ NULL,
  };

- GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
  return "CPU";

  GGML_UNUSED(buft);
  }

- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
  void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
  if (data == NULL) {
@@ -639,24 +747,24 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer
  return NULL;
  }

- return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
  }

- GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
  return TENSOR_ALIGNMENT;

  GGML_UNUSED(buft);
  }

- GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  return true;

  GGML_UNUSED(buft);
  }

- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
- /* .iface = */ {
+ /* .iface = */ {
  /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
@@ -664,6 +772,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
  },
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
  /* .context = */ NULL,
  };

@@ -676,23 +785,23 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {

  #include <hbwmalloc.h>

- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
  return "CPU_HBM";

  GGML_UNUSED(buft);
  }

- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+ static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
  return "CPU_HBM";

  GGML_UNUSED(buf);
  }

- GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  hbw_free(buffer->context);
  }

- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  //void * ptr = hbw_malloc(size);
  void * ptr;
  int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -727,28 +836,30 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
  #endif

  struct ggml_backend_cpu_context {
- int n_threads;
- void * work_data;
- size_t work_size;
+ int n_threads;
+ ggml_threadpool_t threadpool;
+
+ uint8_t * work_data;
+ size_t work_size;

  ggml_abort_callback abort_callback;
  void * abort_callback_data;
  };

- GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+ static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
  return "CPU";

  GGML_UNUSED(backend);
  }

- GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
+ static void ggml_backend_cpu_free(ggml_backend_t backend) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
- free(cpu_ctx->work_data);
- free(cpu_ctx);
- free(backend);
+ delete[] cpu_ctx->work_data;
+ delete cpu_ctx;
+ delete backend;
  }

- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+ static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
  return ggml_backend_cpu_buffer_type();

  GGML_UNUSED(backend);
@@ -759,18 +870,18 @@ struct ggml_backend_plan_cpu {
  struct ggml_cgraph cgraph;
  };

- GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

- struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+ struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;

- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
  cpu_plan->cgraph = *cgraph; // FIXME: deep copy

  if (cpu_plan->cplan.work_size > 0) {
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
  if (cpu_plan->cplan.work_data == NULL) {
- free(cpu_plan);
+ delete cpu_plan;
  return NULL;
  }
  }
@@ -781,16 +892,16 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
  return cpu_plan;
  }

- GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

- free(cpu_plan->cplan.work_data);
- free(cpu_plan);
+ delete[] cpu_plan->cplan.work_data;
+ delete cpu_plan;

  GGML_UNUSED(backend);
  }

- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

  return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -798,21 +909,21 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
  GGML_UNUSED(backend);
  }

- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

  if (cpu_ctx->work_size < cplan.work_size) {
- free(cpu_ctx->work_data);
- cpu_ctx->work_data = malloc(cplan.work_size);
+ delete[] cpu_ctx->work_data;
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
  if (cpu_ctx->work_data == NULL) {
  cpu_ctx->work_size = 0;
  return GGML_STATUS_ALLOC_FAILED;
  }
  cpu_ctx->work_size = cplan.work_size;
  }
- cplan.work_data = cpu_ctx->work_data;
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;

  cplan.abort_callback = cpu_ctx->abort_callback;
  cplan.abort_callback_data = cpu_ctx->abort_callback_data;
@@ -820,31 +931,8 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
  return ggml_graph_compute(cgraph, &cplan);
  }

- GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- switch (op->op) {
- case GGML_OP_CPY:
- return
- op->type != GGML_TYPE_IQ2_XXS &&
- op->type != GGML_TYPE_IQ2_XS &&
- op->type != GGML_TYPE_IQ1_S &&
- op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
- case GGML_OP_MUL_MAT:
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
- default:
- return true;
- }
-
- GGML_UNUSED(backend);
- }
-
- GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
- return ggml_backend_buft_is_host(buft);
-
- GGML_UNUSED(backend);
- }
-
- static struct ggml_backend_i cpu_backend_i = {
- /* .get_name = */ ggml_backend_cpu_name,
+ static const struct ggml_backend_i ggml_backend_cpu_i = {
+ /* .get_name = */ ggml_backend_cpu_get_name,
  /* .free = */ ggml_backend_cpu_free,
  /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
  /* .set_tensor_async = */ NULL,
@@ -856,14 +944,11 @@ static struct ggml_backend_i cpu_backend_i = {
  /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
- /* .supports_op = */ ggml_backend_cpu_supports_op,
- /* .supports_buft = */ ggml_backend_cpu_supports_buft,
+ /* .supports_op = */ NULL,
+ /* .supports_buft = */ NULL,
  /* .offload_op = */ NULL,
- /* .event_new = */ NULL,
- /* .event_free = */ NULL,
  /* .event_record = */ NULL,
  /* .event_wait = */ NULL,
- /* .event_synchronize = */ NULL,
  };

  static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -872,32 +957,34 @@ static ggml_guid_t ggml_backend_cpu_guid(void) {
  }

  ggml_backend_t ggml_backend_cpu_init(void) {
- struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+ struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
  if (ctx == NULL) {
  return NULL;
  }

  ctx->n_threads = GGML_DEFAULT_N_THREADS;
+ ctx->threadpool = NULL;
  ctx->work_data = NULL;
  ctx->work_size = 0;
  ctx->abort_callback = NULL;
  ctx->abort_callback_data = NULL;

- ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+ ggml_backend_t cpu_backend = new ggml_backend {
+ /* .guid = */ ggml_backend_cpu_guid(),
+ /* .interface = */ ggml_backend_cpu_i,
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ ctx,
+ };
+
  if (cpu_backend == NULL) {
- free(ctx);
+ delete ctx;
  return NULL;
  }

- *cpu_backend = (struct ggml_backend) {
- /* .guid = */ ggml_backend_cpu_guid(),
- /* .interface = */ cpu_backend_i,
- /* .context = */ ctx
- };
  return cpu_backend;
  }

- GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
+ bool ggml_backend_is_cpu(ggml_backend_t backend) {
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
  }

@@ -908,6 +995,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
  ctx->n_threads = n_threads;
  }

+ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
+ // already had a different threadpool, pause/suspend it before switching
+ ggml_threadpool_pause(ctx->threadpool);
+ }
+ ctx->threadpool = threadpool;
+ }
+
  void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
  GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

@@ -916,16 +1015,226 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
  ctx->abort_callback_data = abort_callback_data;
  }

- GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
  GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
  }

- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+ ////////////////////////
+
+ struct ggml_backend_cpu_device_context {
+ std::string description = "CPU";
+
+ ggml_backend_cpu_device_context() {
+ #ifdef __APPLE__
+ size_t len = 0;
+ if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+ description.resize(len);
+ sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+ }
+ #elif defined(__linux__)
+ FILE * f = fopen("/proc/cpuinfo", "r");
+ if (f) {
+ char buf[1024];
+ while (fgets(buf, sizeof(buf), f)) {
+ if (strncmp(buf, "model name", 10) == 0) {
+ char * p = strchr(buf, ':');
+ if (p) {
+ p++;
+ while (std::isspace(*p)) {
+ p++;
+ }
+ while (std::isspace(p[strlen(p) - 1])) {
+ p[strlen(p) - 1] = '\0';
+ }
+ description = p;
+ break;
+ }
+ }
+ }
+ fclose(f);
+ }
+ #elif defined(_WIN32)
+ HKEY hKey;
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+ 0,
+ KEY_READ,
+ &hKey) == ERROR_SUCCESS) {
+ DWORD cpu_brand_size = 0;
+ if (RegQueryValueExA(hKey,
+ TEXT("ProcessorNameString"),
+ NULL,
+ NULL,
+ NULL,
+ &cpu_brand_size) == ERROR_SUCCESS) {
+ description.resize(cpu_brand_size);
+ if (RegQueryValueExA(hKey,
+ TEXT("ProcessorNameString"),
+ NULL,
+ NULL,
+ (LPBYTE)&description[0], // NOLINT
+ &cpu_brand_size) == ERROR_SUCCESS) {
+ if (description.find('\0') != std::string::npos) {
+ description.resize(description.find('\0'));
+ }
+ }
+ }
+ RegCloseKey(hKey);
+ }
+ #endif
+ }
+ };
+
+ static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
+ return "CPU";
+
+ GGML_UNUSED(dev);
+ }
+
+ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
+ struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
+
+ return ctx->description.c_str();
+ }
+
+ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+ // TODO
+ *free = 0;
+ *total = 0;
+
+ GGML_UNUSED(dev);
+ }
+
+ static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
+ return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
+
+ GGML_UNUSED(dev);
+ }
+
+ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+ props->name = ggml_backend_cpu_device_get_name(dev);
+ props->description = ggml_backend_cpu_device_get_description(dev);
+ props->type = ggml_backend_cpu_device_get_type(dev);
+ ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ props->caps = {
+ /* async */ false,
+ /* host_buffer */ false,
+ /* events */ false,
+ };
+ }
+
+ static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
  return ggml_backend_cpu_init();

+ GGML_UNUSED(dev);
  GGML_UNUSED(params);
- GGML_UNUSED(user_data);
+ }
+
+ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
+ return ggml_backend_cpu_buffer_type();
+
+ GGML_UNUSED(dev);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+ return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+ GGML_UNUSED(dev);
+ GGML_UNUSED(max_tensor_size);
+ }
+
+ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+ switch (op->op) {
+ case GGML_OP_CPY:
+ return
+ op->type != GGML_TYPE_IQ2_XXS &&
+ op->type != GGML_TYPE_IQ2_XS &&
+ op->type != GGML_TYPE_IQ1_S &&
+ op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
+ case GGML_OP_MUL_MAT:
+ return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+ case GGML_OP_ROPE_BACK:
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+ case GGML_OP_IM2COL_BACK:
+ return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+ case GGML_OP_OUT_PROD:
+ return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+ default:
+ return true;
+ }
+
+ GGML_UNUSED(dev);
+ }
+
+ static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+ return ggml_backend_buft_is_host(buft);
+
+ GGML_UNUSED(dev);
+ }
+
+ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
+ /* .get_name = */ ggml_backend_cpu_device_get_name,
+ /* .get_description = */ ggml_backend_cpu_device_get_description,
+ /* .get_memory = */ ggml_backend_cpu_device_get_memory,
+ /* .get_type = */ ggml_backend_cpu_device_get_type,
+ /* .get_props = */ ggml_backend_cpu_device_get_props,
+ /* .init_backend = */ ggml_backend_cpu_device_init,
+ /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
+ /* .get_host_buffer_type = */ NULL,
+ /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
+ /* .supports_op = */ ggml_backend_cpu_device_supports_op,
+ /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
+ /* .offload_op = */ NULL,
+ /* .event_new = */ NULL,
+ /* .event_free = */ NULL,
+ /* .event_synchronize = */ NULL,
+ };
+
+ ////////////////////////
+
+ static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
+ return "CPU";
+
+ GGML_UNUSED(reg);
+ }
+
+ static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
+ return 1;
+
+ GGML_UNUSED(reg);
+ }
+
+ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+ GGML_ASSERT(index == 0);
+
+ static ggml_backend_cpu_device_context ctx;
+ static ggml_backend_device ggml_backend_cpu_device = {
+ /* .iface = */ ggml_backend_cpu_device_i,
+ /* .reg = */ reg,
+ /* .context = */ &ctx,
+ };
+
+ return &ggml_backend_cpu_device;
+
+ GGML_UNUSED(reg);
+ GGML_UNUSED(index);
+ }
1223
+
1224
+ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
1225
+ /* .get_name = */ ggml_backend_cpu_reg_get_name,
1226
+ /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
1227
+ /* .get_device = */ ggml_backend_cpu_reg_get_device,
1228
+ /* .get_proc_address = */ NULL,
1229
+ };
1230
+
1231
+ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
1232
+ static struct ggml_backend_reg ggml_backend_cpu_reg = {
1233
+ /* .iface = */ ggml_backend_cpu_reg_i,
1234
+ /* .context = */ NULL,
1235
+ };
1236
+
1237
+ return &ggml_backend_cpu_reg;
929
1238
  }
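The block above gives the CPU backend the new device/registry layer: ggml_backend_cpu_device_context detects a human-readable CPU description per platform, ggml_backend_cpu_device_i exposes the device operations, and ggml_backend_cpu_reg() returns a one-device registry. A minimal sketch of how these entry points could be walked; it assumes the interface structs from ggml-backend-impl.h are in scope (as they are inside this file), whereas application code would normally go through the public ggml_backend_dev_* wrappers instead:

    #include <cstdio>
    #include "ggml-backend-impl.h" // assumed include: defines the reg/device structs and their iface tables

    static void print_cpu_device_info(void) {
        ggml_backend_reg_t reg = ggml_backend_cpu_reg();
        for (size_t i = 0; i < reg->iface.get_device_count(reg); i++) {
            ggml_backend_dev_t dev = reg->iface.get_device(reg, i);

            struct ggml_backend_dev_props props;
            dev->iface.get_props(dev, &props);
            // e.g. "CPU: <brand string from sysctl, /proc/cpuinfo or the Windows registry>"
            printf("%s: %s\n", props.name, props.description);

            ggml_backend_t backend = dev->iface.init_backend(dev, /* params = */ NULL);
            ggml_backend_free(backend);
        }
    }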
930
1239
 
931
1240
  // multi-buffer buffer
@@ -935,16 +1244,14 @@ struct ggml_backend_multi_buffer_context {
935
1244
  size_t n_buffers;
936
1245
  };
937
1246
 
938
- typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
939
-
940
- GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
941
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
1247
+ static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
1248
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
942
1249
 
943
1250
  return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
944
1251
  }
945
1252
 
946
- GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
947
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
1253
+ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1254
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
948
1255
  for (size_t i = 0; i < ctx->n_buffers; i++) {
949
1256
  ggml_backend_buffer_free(ctx->buffers[i]);
950
1257
  }
@@ -953,31 +1260,28 @@ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_
953
1260
  free(ctx);
954
1261
  }
955
1262
 
956
- GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
957
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
1263
+ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1264
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
958
1265
  for (size_t i = 0; i < ctx->n_buffers; i++) {
959
1266
  ggml_backend_buffer_clear(ctx->buffers[i], value);
960
1267
  }
961
1268
  }
962
1269
 
963
- static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
964
- static struct ggml_backend_buffer_i multi_backend_buffer_i = {
965
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
966
- /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
967
- /* .get_base = */ NULL,
968
- /* .init_tensor = */ NULL,
969
- /* .set_tensor = */ NULL,
970
- /* .get_tensor = */ NULL,
971
- /* .cpy_tensor = */ NULL,
972
- /* .clear = */ ggml_backend_multi_buffer_clear,
973
- /* .reset = */ NULL,
974
- };
975
-
976
- return multi_backend_buffer_i;
977
- }
1270
+ static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
1271
+ /* .get_name = */ ggml_backend_multi_buffer_get_name,
1272
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
1273
+ /* .get_base = */ NULL,
1274
+ /* .init_tensor = */ NULL,
1275
+ /* .memset_tensor = */ NULL,
1276
+ /* .set_tensor = */ NULL,
1277
+ /* .get_tensor = */ NULL,
1278
+ /* .cpy_tensor = */ NULL,
1279
+ /* .clear = */ ggml_backend_multi_buffer_clear,
1280
+ /* .reset = */ NULL,
1281
+ };
978
1282
 
979
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
980
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
1283
+ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
1284
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
981
1285
  ctx->n_buffers = n_buffers;
982
1286
  ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
983
1287
 
@@ -989,16 +1293,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
989
1293
  total_size += ggml_backend_buffer_get_size(buffers[i]);
990
1294
  }
991
1295
 
992
- return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
1296
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
993
1297
  }
994
1298
 
995
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
1299
+ bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
996
1300
  return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
997
1301
  }
998
1302
 
999
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
1303
+ void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
1000
1304
  GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
1001
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
1305
+ ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1002
1306
  for (size_t i = 0; i < ctx->n_buffers; i++) {
1003
1307
  ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1004
1308
  }
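The multi-buffer hunks above replace the ggml_backend_multi_buffer_context_t typedef and the GGML_CALL-decorated functions with plain static functions and a single const interface table (now including the new memset_tensor slot). The public behaviour is unchanged; a small usage sketch, with made-up sizes and a usage flag chosen only for illustration:

    #include "ggml-backend.h"

    static ggml_backend_buffer_t make_weight_multi_buffer(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        ggml_backend_buffer_t parts[2] = {
            ggml_backend_buft_alloc_buffer(buft, 1u << 20), // 1 MiB
            ggml_backend_buft_alloc_buffer(buft, 4u << 20), // 4 MiB
        };

        // wraps both buffers; the reported size is the sum of the parts
        ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);

        if (ggml_backend_buffer_is_multi_buffer(multi)) {
            // forwarded to every child buffer by ggml_backend_multi_buffer_set_usage above
            ggml_backend_multi_buffer_set_usage(multi, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }
        return multi; // freeing the multi-buffer also frees its children
    }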
@@ -1023,10 +1327,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
1023
1327
  #define GGML_SCHED_MAX_BACKENDS 16
1024
1328
  #endif
1025
1329
 
1026
- #ifndef GGML_SCHED_MAX_SPLITS
1027
- #define GGML_SCHED_MAX_SPLITS 2048
1028
- #endif
1029
-
1030
1330
  #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1031
1331
  #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
1032
1332
  #endif
@@ -1130,7 +1430,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
1130
1430
  }
1131
1431
 
1132
1432
  #if 0
1133
- static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1433
+ #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
1434
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1134
1435
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1135
1436
  #define GET_CAUSE(node) causes[hash_id(node)]
1136
1437
  #else
@@ -1158,6 +1459,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1158
1459
  }
1159
1460
  }
1160
1461
 
1462
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1463
+ // since the tensor is pre-allocated, it cannot be moved to another backend
1464
+ GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
1465
+ }
1466
+
1161
1467
  // graph input
1162
1468
  if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1163
1469
  cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
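The added guard treats a tensor that already lives in a buffer, directly or through its view source, as pinned to that placement: if the chosen backend cannot run its op, the scheduler now aborts instead of silently moving the tensor. A one-line restatement of that pinned condition, as an illustrative helper that is not part of the diff:

    static bool ggml_sched_tensor_is_preallocated(const struct ggml_tensor * t) {
        // mirrors the condition checked above: the tensor or its view source already owns a buffer
        return t->buffer != NULL || (t->view_src != NULL && t->view_src->buffer != NULL);
    }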
@@ -1551,10 +1857,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1551
1857
  i_split++;
1552
1858
  if (i_split >= sched->splits_capacity) {
1553
1859
  sched->splits_capacity *= 2;
1554
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1860
+ sched->splits = (ggml_backend_sched_split *)
1861
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1555
1862
  GGML_ASSERT(sched->splits != NULL);
1556
1863
  }
1557
- GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
1558
1864
  split = &sched->splits[i_split];
1559
1865
  split->backend_id = node_backend_id;
1560
1866
  split->i_start = i;
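This hunk drops the hard GGML_SCHED_MAX_SPLITS ceiling: instead of asserting against a fixed cap, the splits array now doubles its capacity on demand, with an explicit cast because the file is compiled as C++. A standalone sketch of the same grow-by-doubling pattern, using stand-in types rather than the scheduler's own:

    #include <cassert>
    #include <cstdlib>

    struct split_stub { int backend_id; int i_start; int i_end; };

    // appends one split, doubling the backing array when it is full (capacity must start > 0)
    static void push_split(split_stub ** splits, int * n, int * capacity, split_stub s) {
        if (*n >= *capacity) {
            *capacity *= 2;
            *splits = (split_stub *) realloc(*splits, *capacity * sizeof(split_stub));
            assert(*splits != NULL);
        }
        (*splits)[(*n)++] = s;
    }

Amortized over many appends this costs constant time per split, which is why the fixed cap and the assert against it could be removed.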
@@ -1638,11 +1944,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1638
1944
  sched->prev_leaf_backend_ids = tmp;
1639
1945
  }
1640
1946
 
1641
- int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1947
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1642
1948
  if (sched->graph.size < graph_size) {
1643
1949
  sched->graph.size = graph_size;
1644
- sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
1645
- sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
1950
+ sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
1951
+ sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
1646
1952
  GGML_ASSERT(sched->graph.nodes != NULL);
1647
1953
  GGML_ASSERT(sched->graph.leafs != NULL);
1648
1954
  }
@@ -1690,6 +1996,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1690
1996
  for (int c = 0; c < sched->n_copies; c++) {
1691
1997
  struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1692
1998
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1999
+ assert(graph_copy->size > graph_copy->n_leafs);
1693
2000
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1694
2001
  }
1695
2002
  }
@@ -1703,6 +2010,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1703
2010
  for (int c = 0; c < sched->n_copies; c++) {
1704
2011
  struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1705
2012
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
2013
+ assert(graph_copy->size > graph_copy->n_leafs);
1706
2014
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1707
2015
  }
1708
2016
  }
@@ -1713,6 +2021,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1713
2021
  for (int i = 0; i < graph->n_leafs; i++) {
1714
2022
  struct ggml_tensor * leaf = graph->leafs[i];
1715
2023
  sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
2024
+ assert(graph_copy->size > graph_copy->n_leafs);
1716
2025
  graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1717
2026
  }
1718
2027
  }
@@ -1782,7 +2091,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1782
2091
  } else {
1783
2092
  ggml_backend_synchronize(split_backend);
1784
2093
  }
1785
- ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
2094
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
2095
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
2096
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
2097
+ ggml_backend_synchronize(input_backend);
2098
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
2099
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
2100
+ } else {
2101
+ ggml_backend_synchronize(split_backend);
2102
+ }
2103
+ ggml_backend_tensor_copy(input, input_cpy);
2104
+ }
1786
2105
  }
1787
2106
  }
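The copy above now degrades gracefully: it first tries the backend's cpy_tensor_async interface and, if that is missing or declines, synchronizes explicitly (via the recorded event when one exists) before falling back to a blocking ggml_backend_tensor_copy. A condensed sketch of that decision with the scheduler-specific bookkeeping stripped out; it assumes the interface structs from ggml-backend-impl.h are visible, as they are inside this file:

    static void copy_split_input(ggml_backend_t input_backend, ggml_backend_t split_backend,
                                 ggml_backend_event_t copy_event, // may be NULL
                                 struct ggml_tensor * input, struct ggml_tensor * input_cpy) {
        if (split_backend->iface.cpy_tensor_async &&
            split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
            return; // async path handled the copy
        }
        // synchronous fallback: wait on the event if the scheduler recorded one
        ggml_backend_synchronize(input_backend);
        if (copy_event != NULL) {
            ggml_backend_event_synchronize(copy_event);
        } else {
            ggml_backend_synchronize(split_backend);
        }
        ggml_backend_tensor_copy(input, input_cpy);
    }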
1788
2107
 
@@ -1828,7 +2147,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1828
2147
  // record the event of this copy
1829
2148
  if (split->n_inputs > 0) {
1830
2149
  if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1831
- ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
2150
+ ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
1832
2151
  }
1833
2152
  }
1834
2153
  }
@@ -1848,7 +2167,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
1848
2167
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
1849
2168
  GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1850
2169
 
1851
- struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
2170
+ struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
1852
2171
 
1853
2172
  sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
1854
2173
  sched->n_backends = n_backends;
@@ -1857,20 +2176,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
1857
2176
  // initialize hash table
1858
2177
  // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1859
2178
  sched->hash_set = ggml_hash_set_new(graph_size);
1860
- sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1861
- sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
2179
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
2180
+ sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
1862
2181
 
1863
- const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1864
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1865
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1866
- sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1867
- sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
2182
+ const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
2183
+ const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
2184
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
2185
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
2186
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
2187
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1868
2188
 
1869
- sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
1870
- sched->context_buffer = malloc(sched->context_buffer_size);
2189
+ sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
2190
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
1871
2191
 
1872
2192
  const int initial_splits_capacity = 16;
1873
- sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
2193
+ sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1874
2194
  sched->splits_capacity = initial_splits_capacity;
1875
2195
 
1876
2196
  for (int b = 0; b < n_backends; b++) {
@@ -1879,7 +2199,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
1879
2199
  GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1880
2200
  if (sched->n_copies > 1) {
1881
2201
  for (int c = 0; c < sched->n_copies; c++) {
1882
- sched->events[b][c] = ggml_backend_event_new(backends[b]);
2202
+ sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
1883
2203
  }
1884
2204
  }
1885
2205
  }
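In ggml_backend_sched_new the worst-case number of splits is now derived from the graph (at most one split per node) instead of a compile-time constant, the allocations gain C++ casts, and the synchronization events are created from each backend's device rather than the backend itself. As a rough feel for the resulting array sizes, a sketch assuming GGML_SCHED_MAX_SPLIT_INPUTS resolves to 10 (the usual value of GGML_MAX_SRC, which may differ between versions):

    #include <cstddef>

    // nodes_size = graph_size + max_splits * GGML_SCHED_MAX_SPLIT_INPUTS * 2, with max_splits = graph_size
    constexpr size_t graph_size       = 2048;
    constexpr size_t max_split_inputs = 10; // assumed value of GGML_SCHED_MAX_SPLIT_INPUTS
    constexpr size_t nodes_size       = graph_size + graph_size * max_split_inputs * 2;
    static_assert(nodes_size == 43008, "entries per backend-id array for a 2048-node graph");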
@@ -2115,8 +2435,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
2115
2435
 
2116
2436
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
2117
2437
  struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
2118
- struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2119
- bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
2438
+ struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2439
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
2120
2440
 
2121
2441
  struct ggml_init_params params = {
2122
2442
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -2134,7 +2454,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2134
2454
  free(node_init);
2135
2455
  ggml_free(ctx_allocated);
2136
2456
  ggml_free(ctx_unallocated);
2137
- return (struct ggml_backend_graph_copy) {
2457
+ return {
2138
2458
  /* .buffer = */ NULL,
2139
2459
  /* .ctx_allocated = */ NULL,
2140
2460
  /* .ctx_unallocated = */ NULL,
@@ -2157,7 +2477,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2157
2477
  free(node_init);
2158
2478
  ggml_free(ctx_allocated);
2159
2479
  ggml_free(ctx_unallocated);
2160
- return (struct ggml_backend_graph_copy) {
2480
+ return {
2161
2481
  /* .buffer = */ NULL,
2162
2482
  /* .ctx_allocated = */ NULL,
2163
2483
  /* .ctx_unallocated = */ NULL,
@@ -2186,7 +2506,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2186
2506
  free(node_copies);
2187
2507
  free(node_init);
2188
2508
 
2189
- return (struct ggml_backend_graph_copy) {
2509
+ return {
2190
2510
  /* .buffer = */ buffer,
2191
2511
  /* .ctx_allocated = */ ctx_allocated,
2192
2512
  /* .ctx_unallocated = */ ctx_unallocated,
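The remaining hunks are mechanical fallout from compiling the file as C++: allocation results need explicit casts, and the C compound-literal returns become brace-initialized aggregate returns. A minimal side-by-side illustration with stand-in types only:

    #include <cstdlib>

    struct graph_copy_stub { void * buffer; void * ctx_allocated; void * ctx_unallocated; };

    static graph_copy_stub make_empty_copy(void) {
        // C allows `ids = calloc(...)` and `return (struct graph_copy_stub) { ... };`
        // C++ needs the cast, and a plain braced return initializes the aggregate
        int * ids = (int *) calloc(16, sizeof(int));
        free(ids);
        return {
            /* .buffer = */ nullptr,
            /* .ctx_allocated = */ nullptr,
            /* .ctx_unallocated = */ nullptr,
        };
    }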