@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
The per-line diff below corresponds to package/src/llama.cpp/ggml/src/ggml-backend.cpp (+303 -864):

@@ -8,6 +8,7 @@
 #include <windows.h>
 #endif
 
+#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
@@ -34,6 +35,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -89,7 +95,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return
+    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -108,6 +114,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }
 
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -122,6 +133,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
     }
 }
 
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    // clear is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return;
+    }
+
+    buffer->iface.clear(buffer, value);
+}
+
 size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
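The three additions above change what a zero-sized allocation means: ggml_backend_buft_alloc_buffer() now returns a dummy buffer instead of calling into the backend, get_base() returns NULL for such a buffer, and clear() becomes a no-op. The following is only an illustrative sketch of the resulting behavior, assuming the CPU backend headers shipped with this release (ggml-cpu.h provides ggml_backend_cpu_buffer_type()); it is not code from the package.

#include <assert.h>
#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    // zero-sized request: a dummy buffer is returned rather than NULL
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 0);
    assert(buf != NULL);
    assert(ggml_backend_buffer_get_size(buf) == 0);
    assert(ggml_backend_buffer_get_base(buf) == NULL); // get_base is optional for zero-sized buffers

    ggml_backend_buffer_clear(buf, 0);                 // no-op for zero-sized buffers
    ggml_backend_buffer_free(buf);

    printf("zero-sized buffer handled\n");
    return 0;
}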
@@ -134,10 +154,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
 bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
 }
@@ -198,7 +214,7 @@ void ggml_backend_free(ggml_backend_t backend) {
 }
 
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->
+    return ggml_backend_dev_buffer_type(backend->device);
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
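With the change above, a backend's default buffer type is resolved through its device. A brief sketch of how that is typically used when allocating scratch space; alloc_scratch is a hypothetical helper, not part of the package.

// hypothetical helper: allocate scratch space using a backend's default buffer type
ggml_backend_buffer_t alloc_scratch(ggml_backend_t backend, size_t size) {
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
    // equivalent to calling ggml_backend_alloc_buffer(backend, size) directly
    return ggml_backend_buft_alloc_buffer(buft, size);
}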
@@ -238,43 +254,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
-    if (!size) {
-        return;
-    }
-
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
-
+void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
+    if (size == 0) {
         return;
     }
 
-    GGML_ASSERT(buf
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
 
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
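ggml_backend_tensor_set/get/memset now return early when size == 0, before any of the asserts run, so callers no longer need to guard empty transfers themselves. A rough usage sketch, assuming the CPU backend and the ggml-alloc helper from this release (ggml-cpu.h, ggml-alloc.h); sizes and values are arbitrary.

#include <assert.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                        // tensor data lives in a backend buffer
    };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    float in[4] = { 1, 2, 3, 4 }, out[4] = { 0 };
    ggml_backend_tensor_set(t, in, 0, ggml_nbytes(t));   // host -> backend
    ggml_backend_tensor_set(t, in, 0, 0);                // size == 0: no-op, no assert fires
    ggml_backend_tensor_get(t, out, 0, ggml_nbytes(t));  // backend -> host
    assert(out[3] == 4.0f);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}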
@@ -316,33 +331,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
-    if (backend->device) {
-        return ggml_backend_dev_supports_op(backend->device, op);
-    }
-
-    return backend->iface.supports_op(backend, op);
+    return ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-
-    if (backend->device) {
-        return ggml_backend_dev_supports_buft(backend->device, buft);
-    }
-
-    return backend->iface.supports_buft(backend, buft);
+    return ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-
-    if (backend->device) {
-        return ggml_backend_dev_offload_op(backend->device, op);
-    }
-
-    if (backend->iface.offload_op != NULL) {
-        return backend->iface.offload_op(backend, op);
-    }
-    return false;
+    return ggml_backend_dev_offload_op(backend->device, op);
 }
 
 ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
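After this hunk, ggml_backend_supports_op() simply forwards to the backend's device, so a non-NULL device is assumed for every backend. A hedged illustration of querying op support (backend_can_mul_mat is a hypothetical helper; it builds the op in a no_alloc context purely to ask the question):

#include <stdbool.h>
#include "ggml.h"
#include "ggml-backend.h"

// illustration only: ask a backend whether it can run an F32 matrix multiplication
bool backend_can_mul_mat(ggml_backend_t backend, int64_t n, int64_t k) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n);
    struct ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, 1);
    struct ggml_tensor * op = ggml_mul_mat(ctx, a, b);

    bool ok = ggml_backend_supports_op(backend, op); // delegates to ggml_backend_dev_supports_op()
    ggml_free(ctx);
    return ok;
}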
@@ -379,7 +376,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
     } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-
+        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
@@ -409,832 +406,123 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
     ggml_backend_tensor_copy(src, dst);
 }
 

The remainder of this hunk replaces 832 lines of the 0.3.2 file with 123 lines. Removed from ggml-backend.cpp are: the old event helpers (ggml_backend_event_new/free/record/synchronize/wait); the ggml_backend_dev_* and ggml_backend_reg_* accessors; the static ggml_backend_registry (GGML_USE_CUDA and CPU registration), the enumeration helpers ggml_backend_reg_count/get/by_name and ggml_backend_dev_count/get/by_name/by_type, and the convenience initializers ggml_backend_init_by_name, ggml_backend_init_by_type, and ggml_backend_init_best; and the entire CPU backend implementation, including TENSOR_ALIGNMENT, the CPU buffer interfaces and buffer type, the optional GGML_USE_CPU_HBM buffer type, ggml_backend_cpu_context, the graph plan/compute path, ggml_backend_cpu_init, ggml_backend_is_cpu, the n_threads/threadpool/abort-callback setters, ggml_backend_cpu_buffer_from_ptr, and the CPU device code (brand-string probes via sysctlbyname, /proc/cpuinfo, and the Windows registry, plus the CPU device and registration interface tables). This code now lives in the files added in this release (ggml-backend-reg.cpp and the ggml-cpu/ sources listed above). Trimmed event, device, and registry accessors are added in their place; the replacement code reads:

// events

ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
    // null device is allowed for the transition period to the device interface
    if (device == NULL || device->iface.event_new == NULL) {
        return NULL;
    }
    return device->iface.event_new(device);
}

void ggml_backend_event_free(ggml_backend_event_t event) {
    if (event == NULL) {
        return;
    }
    event->device->iface.event_free(event->device, event);
}

void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
    GGML_ASSERT(backend->iface.event_record != NULL);

    backend->iface.event_record(backend, event);
}

void ggml_backend_event_synchronize(ggml_backend_event_t event) {
    GGML_ASSERT(event->device->iface.event_synchronize);

    event->device->iface.event_synchronize(event->device, event);
}

void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    GGML_ASSERT(backend->iface.event_wait != NULL);

    backend->iface.event_wait(backend, event);
}

// Backend device

const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
    return device->iface.get_name(device);
}

const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
    return device->iface.get_description(device);
}

void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
    device->iface.get_memory(device, free, total);
}

enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
    return device->iface.get_type(device);
}

void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
    memset(props, 0, sizeof(*props));
    device->iface.get_props(device, props);
}

ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
    return device->reg;
}

ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
    return device->iface.init_backend(device, params);
}

ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
    return device->iface.get_buffer_type(device);
}

ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
    if (device->iface.get_host_buffer_type == NULL) {
        return NULL;
    }

    return device->iface.get_host_buffer_type(device);
}

ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
    return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
}

bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
    return device->iface.supports_op(device, op);
}

bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
    return device->iface.supports_buft(device, buft);
}

bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
    if (device->iface.offload_op != NULL) {
        return device->iface.offload_op(device, op);
    }

    return false;
}

// Backend (reg)

const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
    return reg->iface.get_name(reg);
}

size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
    return reg->iface.get_device_count(reg);
}

ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
    return reg->iface.get_device(reg, index);
}

void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (!reg->iface.get_proc_address) {
        return NULL;
    }
    return reg->iface.get_proc_address(reg, name);
}

// multi-buffer buffer
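The re-added accessors above are thin wrappers over the device interface; the registry and its enumeration helpers now live in the newly added ggml-backend-reg.cpp (see the file list). As a hedged sketch of how the device-centric API fits together, using the accessors shown in this diff plus ggml_backend_dev_count/ggml_backend_dev_get from the relocated registry; this is an illustration, not package code.

#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    // enumerate registered devices (the CPU device is always present;
    // GPU backends register only if they were built in)
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        size_t free_mem = 0, total_mem = 0;
        ggml_backend_dev_memory(dev, &free_mem, &total_mem);

        printf("device %zu: %s (%s), %zu/%zu MiB free\n",
               i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));

        // create a backend instance for the device and release it again
        ggml_backend_t backend = ggml_backend_dev_init(dev, /*params =*/ NULL);
        if (backend != NULL) {
            ggml_backend_free(backend);
        }
    }
    return 0;
}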
@@ -1244,12 +532,6 @@ struct ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };
 
-static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
 static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1268,7 +550,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
|
|
|
1268
550
|
}
|
|
1269
551
|
|
|
1270
552
|
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
|
|
1271
|
-
/* .get_name = */ ggml_backend_multi_buffer_get_name,
|
|
1272
553
|
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
|
1273
554
|
/* .get_base = */ NULL,
|
|
1274
555
|
/* .init_tensor = */ NULL,
|
|
@@ -1297,7 +578,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
|
|
|
1297
578
|
}
|
|
1298
579
|
|
|
1299
580
|
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
|
1300
|
-
return buffer->iface.
|
|
581
|
+
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
|
|
1301
582
|
}
|
|
1302
583
|
|
|
1303
584
|
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
|
@@ -1389,7 +670,7 @@ struct ggml_backend_sched {
|
|
|
1389
670
|
char * context_buffer;
|
|
1390
671
|
size_t context_buffer_size;
|
|
1391
672
|
|
|
1392
|
-
|
|
673
|
+
int debug;
|
|
1393
674
|
};
|
|
1394
675
|
|
|
1395
676
|
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
|
@@ -1408,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
|
|
|
1408
689
|
}
|
|
1409
690
|
|
|
1410
691
|
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
|
1411
|
-
ggml_backend_buffer_t buffer = tensor->buffer;
|
|
692
|
+
ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
1412
693
|
if (buffer == NULL) {
|
|
1413
694
|
return -1;
|
|
1414
695
|
}
|
|
@@ -1422,7 +703,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
|
|
|
1422
703
|
}
|
|
1423
704
|
|
|
1424
705
|
#ifndef NDEBUG
|
|
1425
|
-
|
|
706
|
+
GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
|
1426
707
|
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
|
1427
708
|
#endif
|
|
1428
709
|
|
|
@@ -1441,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML
|
|
|
1441
722
|
|
|
1442
723
|
// returns the backend that should be used for the node based on the current locations
|
|
1443
724
|
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
|
|
1444
|
-
// TODO: use supports_op to check if the backend supports the op
|
|
1445
|
-
|
|
1446
725
|
// assign pre-allocated nodes to their backend
|
|
1447
726
|
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
|
1448
727
|
if (cur_backend_id != -1) {
|
|
@@ -1461,7 +740,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1461
740
|
|
|
1462
741
|
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
|
1463
742
|
// since the tensor is pre-allocated, it cannot be moved to another backend
|
|
1464
|
-
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
|
743
|
+
GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
|
|
1465
744
|
}
|
|
1466
745
|
|
|
1467
746
|
// graph input
|
|
@@ -1477,7 +756,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1477
756
|
if (src == NULL) {
|
|
1478
757
|
continue;
|
|
1479
758
|
}
|
|
1480
|
-
|
|
759
|
+
// skip ROPE since the rope freqs tensor is too small to choose a backend based on it
|
|
760
|
+
// not an ideal solution
|
|
761
|
+
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
|
1481
762
|
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
|
1482
763
|
// check if a backend with higher prio wants to offload the op
|
|
1483
764
|
if (src_backend_id == sched->n_backends - 1) {
|
|
@@ -1511,32 +792,34 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-
+                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
-
+            GGML_LOG_DEBUG("\n");
             cur_split++;
         }
         struct ggml_tensor * node = graph->nodes[i];
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-
-
-
-
-
-
-
+        if (sched->debug > 1) {
+            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
             }
-
-            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
-                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+            GGML_LOG_DEBUG("\n");
         }
-        fprintf(stderr, "\n");
     }
 }
 
@@ -1601,6 +884,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1828,11 +1114,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            // check if a weight is on a different backend
+            // check if a weight is on a different and incompatible backend
             // by starting a new split, the memory of the previously offloaded weights can be reused
             if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                 int src_backend_id = tensor_backend_id(src);
-                if (src_backend_id != cur_backend_id) {
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     need_new_split = true;
                     break;
                 }
@@ -1844,7 +1130,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     int src_backend_id = sched->hv_tensor_backend_ids[id];
                     bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                     if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
-                        //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
                         break;
                     }
@@ -2050,11 +1335,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
         ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-
+            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -2165,11 +1450,12 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
 
     struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
-
+    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
     sched->n_backends = n_backends;
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
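The new `debug` field is read once from the `GGML_SCHED_DEBUG` environment variable inside `ggml_backend_sched_new()`, so the level has to be set before the scheduler is created; values greater than 1 also enable the per-node assignment dump shown earlier. A hedged sketch of opting in programmatically follows; the exact `ggml_backend_sched_new()` signature and the use of POSIX `setenv()` are assumptions about the surrounding API, not part of this hunk.

```c
// Illustrative sketch only: set GGML_SCHED_DEBUG before the scheduler reads it.
#include <stdbool.h>
#include <stdlib.h>
#include "ggml-backend.h"   // assumed header declaring ggml_backend_sched_new()

ggml_backend_sched_t make_debug_sched(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts,
                                      int n_backends, size_t graph_size, bool parallel) {
    // level 2 (or higher) also prints per-node/per-source assignments
    setenv("GGML_SCHED_DEBUG", "2", 1);  // POSIX; use _putenv_s() on Windows
    return ggml_backend_sched_new(backends, bufts, n_backends, graph_size, parallel);
}
```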
@@ -2197,6 +1483,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
         GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
@@ -2252,12 +1539,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
+    ggml_backend_sched_synchronize(sched);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
     ggml_backend_sched_reset(sched);
-    ggml_backend_sched_synchronize(sched);
 
     return true;
 }
@@ -2448,7 +1736,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_unallocated = ggml_init(params);
 
     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2471,7 +1759,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2558,3 +1846,154 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     return true;
 }
+
+// CPU backend - buffer
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+
+static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+    /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+    /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+// CPU backend buffer type
+
+// this buffer type is defined here to make it available to all backends
+
+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = ggml_aligned_malloc(size);
+
+    if (data == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+}
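The added `ggml_backend_cpu_buffer_from_ptr()` wraps caller-owned memory: its interface leaves `.free_buffer` as NULL, so releasing the ggml buffer does not release the underlying allocation. A short illustrative sketch follows; the 64-byte alignment choice and the `ggml_backend_buffer_free()` call are assumptions about the surrounding ggml API, not part of this hunk.

```c
// Illustrative sketch only: wrap caller-owned, aligned memory as a CPU buffer.
#include <stdlib.h>
#include "ggml-backend.h"   // assumed header declaring the functions from this diff

int main(void) {
    const size_t size = 16 * 1024 * 1024;
    void * mem = aligned_alloc(64, size);   // assumption: 64-byte alignment satisfies TENSOR_ALIGNMENT
    if (!mem) {
        return 1;
    }

    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(mem, size);
    // ... allocate tensors from `buf` and run compute on the CPU backend ...
    ggml_backend_buffer_free(buf);  // does not free `mem`: the from_ptr interface has no free_buffer

    free(mem);  // the caller keeps ownership of the memory
    return 0;
}
```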