@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
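The largest single change in this release is the sync with upstream llama.cpp, most visibly the port of ggml-backend.c to C++ together with a new backend device/registry API (the ggml_backend_dev_*, ggml_backend_reg_* and ggml_backend_init_* entry points) shown in the excerpt below. As a rough sketch of how those new entry points can be driven from native code, assuming a build against this llama.cpp revision (the function names are taken from the diff itself; this is an illustration, not part of @fugood/llama.node's own JavaScript API):

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // enumerate every device that the backend registry discovered at startup
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            printf("device %zu: %s (%s)\n", i,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
        }

        // create a backend for the "best" device: a full GPU if one is registered, otherwise the CPU
        ggml_backend_t backend = ggml_backend_init_best();
        if (backend == NULL) {
            return 1;
        }

        // ... allocate buffers and compute graphs with the backend as before ...

        ggml_backend_free(backend);
        return 0;
    }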
|
@@ -1,3 +1,13 @@
|
|
|
1
|
+
// Note: porting this file to C++ is a work in progress
|
|
2
|
+
|
|
3
|
+
#ifdef _WIN32
|
|
4
|
+
#define WIN32_LEAN_AND_MEAN
|
|
5
|
+
#ifndef NOMINMAX
|
|
6
|
+
# define NOMINMAX
|
|
7
|
+
#endif
|
|
8
|
+
#include <windows.h>
|
|
9
|
+
#endif
|
|
10
|
+
|
|
1
11
|
#include "ggml-backend-impl.h"
|
|
2
12
|
#include "ggml-alloc.h"
|
|
3
13
|
#include "ggml-impl.h"
|
|
@@ -8,9 +18,14 @@
|
|
|
8
18
|
#include <stdio.h>
|
|
9
19
|
#include <stdlib.h>
|
|
10
20
|
#include <string.h>
|
|
21
|
+
#include <string>
|
|
22
|
+
#include <vector>
|
|
11
23
|
|
|
24
|
+
#ifdef __APPLE__
|
|
25
|
+
#include <sys/types.h>
|
|
26
|
+
#include <sys/sysctl.h>
|
|
27
|
+
#endif
|
|
12
28
|
|
|
13
|
-
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
14
29
|
|
|
15
30
|
// backend buffer type
|
|
16
31
|
|
|
@@ -18,7 +33,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
|
|
18
33
|
return buft->iface.get_name(buft);
|
|
19
34
|
}
|
|
20
35
|
|
|
21
|
-
|
|
36
|
+
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
22
37
|
return buft->iface.alloc_buffer(buft, size);
|
|
23
38
|
}
|
|
24
39
|
|
|
@@ -34,7 +49,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
|
|
34
49
|
return SIZE_MAX;
|
|
35
50
|
}
|
|
36
51
|
|
|
37
|
-
|
|
52
|
+
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
|
38
53
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
|
39
54
|
if (buft->iface.get_alloc_size) {
|
|
40
55
|
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
|
@@ -51,16 +66,18 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
|
|
51
66
|
return false;
|
|
52
67
|
}
|
|
53
68
|
|
|
54
|
-
|
|
69
|
+
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
|
|
70
|
+
return buft->device;
|
|
71
|
+
}
|
|
55
72
|
|
|
56
|
-
|
|
57
|
-
ggml_backend_buffer_type_t buft,
|
|
58
|
-
struct ggml_backend_buffer_i iface,
|
|
59
|
-
ggml_backend_buffer_context_t context,
|
|
60
|
-
size_t size) {
|
|
61
|
-
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
|
73
|
+
// backend buffer
|
|
62
74
|
|
|
63
|
-
|
|
75
|
+
ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
76
|
+
ggml_backend_buffer_type_t buft,
|
|
77
|
+
struct ggml_backend_buffer_i iface,
|
|
78
|
+
void * context,
|
|
79
|
+
size_t size) {
|
|
80
|
+
ggml_backend_buffer_t buffer = new ggml_backend_buffer {
|
|
64
81
|
/* .interface = */ iface,
|
|
65
82
|
/* .buft = */ buft,
|
|
66
83
|
/* .context = */ context,
|
|
@@ -83,7 +100,7 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
|
|
83
100
|
if (buffer->iface.free_buffer != NULL) {
|
|
84
101
|
buffer->iface.free_buffer(buffer);
|
|
85
102
|
}
|
|
86
|
-
|
|
103
|
+
delete buffer;
|
|
87
104
|
}
|
|
88
105
|
|
|
89
106
|
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
|
@@ -98,14 +115,14 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
98
115
|
return base;
|
|
99
116
|
}
|
|
100
117
|
|
|
101
|
-
|
|
118
|
+
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
|
102
119
|
// init_tensor is optional
|
|
103
120
|
if (buffer->iface.init_tensor) {
|
|
104
121
|
buffer->iface.init_tensor(buffer, tensor);
|
|
105
122
|
}
|
|
106
123
|
}
|
|
107
124
|
|
|
108
|
-
size_t ggml_backend_buffer_get_alignment
|
|
125
|
+
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
|
109
126
|
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
|
110
127
|
}
|
|
111
128
|
|
|
@@ -218,7 +235,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
|
|
218
235
|
}
|
|
219
236
|
}
|
|
220
237
|
|
|
221
|
-
|
|
238
|
+
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
222
239
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
223
240
|
|
|
224
241
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
@@ -232,7 +249,7 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
|
|
|
232
249
|
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
|
233
250
|
}
|
|
234
251
|
|
|
235
|
-
|
|
252
|
+
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
236
253
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
237
254
|
|
|
238
255
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
@@ -246,6 +263,22 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
|
|
|
246
263
|
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
|
247
264
|
}
|
|
248
265
|
|
|
266
|
+
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
267
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
268
|
+
|
|
269
|
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
270
|
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
271
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
272
|
+
|
|
273
|
+
if (!size) {
|
|
274
|
+
return;
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
|
|
278
|
+
|
|
279
|
+
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
|
280
|
+
}
|
|
281
|
+
|
|
249
282
|
void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
250
283
|
if (backend->iface.synchronize == NULL) {
|
|
251
284
|
return;
|
|
@@ -283,20 +316,39 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
|
|
|
283
316
|
}
|
|
284
317
|
|
|
285
318
|
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
319
|
+
// helper to ease transition to device interface
|
|
320
|
+
if (backend->device) {
|
|
321
|
+
return ggml_backend_dev_supports_op(backend->device, op);
|
|
322
|
+
}
|
|
323
|
+
|
|
286
324
|
return backend->iface.supports_op(backend, op);
|
|
287
325
|
}
|
|
288
326
|
|
|
289
327
|
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
328
|
+
// helper to ease transition to device interface
|
|
329
|
+
if (backend->device) {
|
|
330
|
+
return ggml_backend_dev_supports_buft(backend->device, buft);
|
|
331
|
+
}
|
|
332
|
+
|
|
290
333
|
return backend->iface.supports_buft(backend, buft);
|
|
291
334
|
}
|
|
292
335
|
|
|
293
336
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
337
|
+
// helper to ease transition to device interface
|
|
338
|
+
if (backend->device) {
|
|
339
|
+
return ggml_backend_dev_offload_op(backend->device, op);
|
|
340
|
+
}
|
|
341
|
+
|
|
294
342
|
if (backend->iface.offload_op != NULL) {
|
|
295
343
|
return backend->iface.offload_op(backend, op);
|
|
296
344
|
}
|
|
297
345
|
return false;
|
|
298
346
|
}
|
|
299
347
|
|
|
348
|
+
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
|
349
|
+
return backend->device;
|
|
350
|
+
}
|
|
351
|
+
|
|
300
352
|
// backend copy
|
|
301
353
|
|
|
302
354
|
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
|
@@ -351,43 +403,39 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
|
|
|
351
403
|
}
|
|
352
404
|
|
|
353
405
|
// an async copy would normally happen after all the queued operations on both backends are completed
|
|
354
|
-
//
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
} else {
|
|
359
|
-
ggml_backend_synchronize(backend_src);
|
|
360
|
-
ggml_backend_tensor_copy(src, dst);
|
|
361
|
-
ggml_backend_synchronize(backend_dst);
|
|
362
|
-
}
|
|
406
|
+
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
|
|
407
|
+
ggml_backend_synchronize(backend_src);
|
|
408
|
+
ggml_backend_synchronize(backend_dst);
|
|
409
|
+
ggml_backend_tensor_copy(src, dst);
|
|
363
410
|
}
|
|
364
411
|
|
|
365
412
|
// events
|
|
366
413
|
|
|
367
|
-
ggml_backend_event_t ggml_backend_event_new(
|
|
368
|
-
|
|
414
|
+
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
|
|
415
|
+
// null device is allowed for the transition period to the device interface
|
|
416
|
+
if (device == NULL || device->iface.event_new == NULL) {
|
|
369
417
|
return NULL;
|
|
370
418
|
}
|
|
371
|
-
return
|
|
419
|
+
return device->iface.event_new(device);
|
|
372
420
|
}
|
|
373
421
|
|
|
374
422
|
void ggml_backend_event_free(ggml_backend_event_t event) {
|
|
375
423
|
if (event == NULL) {
|
|
376
424
|
return;
|
|
377
425
|
}
|
|
378
|
-
event->
|
|
426
|
+
event->device->iface.event_free(event->device, event);
|
|
379
427
|
}
|
|
380
428
|
|
|
381
|
-
void ggml_backend_event_record(ggml_backend_event_t event) {
|
|
382
|
-
GGML_ASSERT(
|
|
429
|
+
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
|
430
|
+
GGML_ASSERT(backend->iface.event_record != NULL);
|
|
383
431
|
|
|
384
|
-
|
|
432
|
+
backend->iface.event_record(backend, event);
|
|
385
433
|
}
|
|
386
434
|
|
|
387
435
|
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
|
388
|
-
GGML_ASSERT(event->
|
|
436
|
+
GGML_ASSERT(event->device->iface.event_synchronize);
|
|
389
437
|
|
|
390
|
-
event->
|
|
438
|
+
event->device->iface.event_synchronize(event->device, event);
|
|
391
439
|
}
|
|
392
440
|
|
|
393
441
|
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
@@ -396,170 +444,223 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
|
|
396
444
|
backend->iface.event_wait(backend, event);
|
|
397
445
|
}
|
|
398
446
|
|
|
399
|
-
//
|
|
447
|
+
// Backend device
|
|
400
448
|
|
|
401
|
-
|
|
449
|
+
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
|
450
|
+
return device->iface.get_name(device);
|
|
451
|
+
}
|
|
402
452
|
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
ggml_backend_buffer_type_t default_buffer_type;
|
|
407
|
-
void * user_data;
|
|
408
|
-
};
|
|
453
|
+
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
|
|
454
|
+
return device->iface.get_description(device);
|
|
455
|
+
}
|
|
409
456
|
|
|
410
|
-
|
|
411
|
-
|
|
457
|
+
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
|
458
|
+
device->iface.get_memory(device, free, total);
|
|
459
|
+
}
|
|
412
460
|
|
|
413
|
-
|
|
461
|
+
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
|
|
462
|
+
return device->iface.get_type(device);
|
|
463
|
+
}
|
|
414
464
|
|
|
415
|
-
|
|
416
|
-
|
|
465
|
+
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
|
|
466
|
+
device->iface.get_props(device, props);
|
|
467
|
+
}
|
|
417
468
|
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
469
|
+
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
|
|
470
|
+
return device->reg;
|
|
471
|
+
}
|
|
421
472
|
|
|
422
|
-
|
|
473
|
+
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
|
|
474
|
+
return device->iface.init_backend(device, params);
|
|
475
|
+
}
|
|
423
476
|
|
|
424
|
-
|
|
477
|
+
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
|
478
|
+
return device->iface.get_buffer_type(device);
|
|
479
|
+
}
|
|
425
480
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
ggml_backend_cuda_reg_devices();
|
|
430
|
-
#endif
|
|
481
|
+
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
|
|
482
|
+
return device->iface.get_host_buffer_type(device);
|
|
483
|
+
}
|
|
431
484
|
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
#endif
|
|
485
|
+
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
|
|
486
|
+
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
|
|
487
|
+
}
|
|
436
488
|
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
|
441
|
-
#endif
|
|
489
|
+
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
490
|
+
return device->iface.supports_op(device, op);
|
|
491
|
+
}
|
|
442
492
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
#endif
|
|
493
|
+
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
|
|
494
|
+
return device->iface.supports_buft(device, buft);
|
|
495
|
+
}
|
|
447
496
|
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
#endif
|
|
497
|
+
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
498
|
+
return device->iface.offload_op(device, op);
|
|
499
|
+
}
|
|
452
500
|
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
501
|
+
// Backend (reg)
|
|
502
|
+
|
|
503
|
+
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
|
504
|
+
return reg->iface.get_name(reg);
|
|
457
505
|
}
|
|
458
506
|
|
|
459
|
-
|
|
460
|
-
|
|
507
|
+
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
|
508
|
+
return reg->iface.get_device_count(reg);
|
|
509
|
+
}
|
|
461
510
|
|
|
462
|
-
|
|
511
|
+
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
|
512
|
+
return reg->iface.get_device(reg, index);
|
|
513
|
+
}
|
|
463
514
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
515
|
+
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
516
|
+
if (!reg->iface.get_proc_address) {
|
|
517
|
+
return NULL;
|
|
518
|
+
}
|
|
519
|
+
return reg->iface.get_proc_address(reg, name);
|
|
520
|
+
}
|
|
470
521
|
|
|
471
|
-
|
|
522
|
+
// Backend registry
|
|
472
523
|
|
|
473
|
-
#
|
|
474
|
-
|
|
524
|
+
#ifdef GGML_USE_CUDA
|
|
525
|
+
#include "ggml-cuda.h"
|
|
475
526
|
#endif
|
|
476
527
|
|
|
477
|
-
|
|
478
|
-
|
|
528
|
+
struct ggml_backend_registry {
|
|
529
|
+
std::vector<ggml_backend_reg_t> backends;
|
|
530
|
+
std::vector<ggml_backend_dev_t> devices;
|
|
479
531
|
|
|
480
|
-
|
|
481
|
-
|
|
532
|
+
ggml_backend_registry() {
|
|
533
|
+
#ifdef GGML_USE_CUDA
|
|
534
|
+
register_backend(ggml_backend_cuda_reg());
|
|
535
|
+
#endif
|
|
482
536
|
|
|
483
|
-
|
|
484
|
-
}
|
|
537
|
+
register_backend(ggml_backend_cpu_reg());
|
|
485
538
|
|
|
486
|
-
|
|
487
|
-
|
|
539
|
+
// TODO: sycl, metal, vulkan, kompute, cann
|
|
540
|
+
}
|
|
488
541
|
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
542
|
+
void register_backend(ggml_backend_reg_t reg) {
|
|
543
|
+
#ifndef NDEBUG
|
|
544
|
+
fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
|
|
545
|
+
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
|
546
|
+
#endif
|
|
547
|
+
backends.push_back(reg);
|
|
548
|
+
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
|
549
|
+
register_device(ggml_backend_reg_dev_get(reg, i));
|
|
493
550
|
}
|
|
494
551
|
}
|
|
495
552
|
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
553
|
+
void register_device(ggml_backend_dev_t device) {
|
|
554
|
+
#ifndef NDEBUG
|
|
555
|
+
fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
|
556
|
+
#endif
|
|
557
|
+
devices.push_back(device);
|
|
558
|
+
}
|
|
559
|
+
};
|
|
499
560
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
561
|
+
static ggml_backend_registry & get_reg() {
|
|
562
|
+
static ggml_backend_registry reg;
|
|
563
|
+
return reg;
|
|
564
|
+
}
|
|
503
565
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
params = "";
|
|
509
|
-
} else {
|
|
510
|
-
snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
|
|
511
|
-
params++;
|
|
512
|
-
}
|
|
566
|
+
// Internal API
|
|
567
|
+
void ggml_backend_register(ggml_backend_reg_t reg) {
|
|
568
|
+
get_reg().register_backend(reg);
|
|
569
|
+
}
|
|
513
570
|
|
|
514
|
-
|
|
571
|
+
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
|
572
|
+
get_reg().register_device(device);
|
|
573
|
+
}
|
|
515
574
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
575
|
+
// Backend (reg) enumeration
|
|
576
|
+
size_t ggml_backend_reg_count() {
|
|
577
|
+
return get_reg().backends.size();
|
|
578
|
+
}
|
|
520
579
|
|
|
521
|
-
|
|
580
|
+
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
|
581
|
+
GGML_ASSERT(index < ggml_backend_reg_count());
|
|
582
|
+
return get_reg().backends[index];
|
|
522
583
|
}
|
|
523
584
|
|
|
524
|
-
const char *
|
|
525
|
-
|
|
585
|
+
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
|
586
|
+
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
|
587
|
+
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
|
588
|
+
if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
|
589
|
+
return reg;
|
|
590
|
+
}
|
|
591
|
+
}
|
|
592
|
+
return NULL;
|
|
593
|
+
}
|
|
526
594
|
|
|
527
|
-
|
|
528
|
-
|
|
595
|
+
// Device enumeration
|
|
596
|
+
size_t ggml_backend_dev_count() {
|
|
597
|
+
return get_reg().devices.size();
|
|
529
598
|
}
|
|
530
599
|
|
|
531
|
-
|
|
532
|
-
|
|
600
|
+
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
|
601
|
+
GGML_ASSERT(index < ggml_backend_dev_count());
|
|
602
|
+
return get_reg().devices[index];
|
|
603
|
+
}
|
|
533
604
|
|
|
534
|
-
|
|
535
|
-
|
|
605
|
+
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
|
606
|
+
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
607
|
+
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
608
|
+
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
|
609
|
+
return dev;
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
return NULL;
|
|
536
613
|
}
|
|
537
614
|
|
|
538
|
-
|
|
539
|
-
|
|
615
|
+
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
|
616
|
+
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
617
|
+
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
618
|
+
if (ggml_backend_dev_type(dev) == type) {
|
|
619
|
+
return dev;
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
return NULL;
|
|
623
|
+
}
|
|
540
624
|
|
|
541
|
-
|
|
542
|
-
|
|
625
|
+
// Convenience functions
|
|
626
|
+
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
|
627
|
+
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
|
628
|
+
if (!dev) {
|
|
629
|
+
return NULL;
|
|
630
|
+
}
|
|
631
|
+
return ggml_backend_dev_init(dev, params);
|
|
543
632
|
}
|
|
544
633
|
|
|
545
|
-
|
|
546
|
-
|
|
634
|
+
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
|
635
|
+
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
|
636
|
+
if (!dev) {
|
|
637
|
+
return NULL;
|
|
638
|
+
}
|
|
639
|
+
return ggml_backend_dev_init(dev, params);
|
|
640
|
+
}
|
|
547
641
|
|
|
548
|
-
|
|
549
|
-
|
|
642
|
+
ggml_backend_t ggml_backend_init_best(void) {
|
|
643
|
+
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
|
|
644
|
+
if (!dev) {
|
|
645
|
+
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
|
|
646
|
+
}
|
|
647
|
+
if (!dev) {
|
|
648
|
+
return NULL;
|
|
649
|
+
}
|
|
650
|
+
return ggml_backend_dev_init(dev, NULL);
|
|
550
651
|
}
|
|
551
652
|
|
|
552
653
|
// backend CPU
|
|
553
654
|
|
|
554
655
|
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
|
555
656
|
|
|
556
|
-
|
|
657
|
+
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
557
658
|
return "CPU";
|
|
558
659
|
|
|
559
660
|
GGML_UNUSED(buffer);
|
|
560
661
|
}
|
|
561
662
|
|
|
562
|
-
|
|
663
|
+
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
563
664
|
uintptr_t data = (uintptr_t)buffer->context;
|
|
564
665
|
|
|
565
666
|
// align the buffer
|
|
@@ -570,23 +671,29 @@ GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t b
|
|
|
570
671
|
return (void *)data;
|
|
571
672
|
}
|
|
572
673
|
|
|
573
|
-
|
|
674
|
+
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
574
675
|
free(buffer->context);
|
|
575
676
|
}
|
|
576
677
|
|
|
577
|
-
|
|
678
|
+
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
679
|
+
memset((char *)tensor->data + offset, value, size);
|
|
680
|
+
|
|
681
|
+
GGML_UNUSED(buffer);
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
578
685
|
memcpy((char *)tensor->data + offset, data, size);
|
|
579
686
|
|
|
580
687
|
GGML_UNUSED(buffer);
|
|
581
688
|
}
|
|
582
689
|
|
|
583
|
-
|
|
690
|
+
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
584
691
|
memcpy(data, (const char *)tensor->data + offset, size);
|
|
585
692
|
|
|
586
693
|
GGML_UNUSED(buffer);
|
|
587
694
|
}
|
|
588
695
|
|
|
589
|
-
|
|
696
|
+
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
590
697
|
if (ggml_backend_buffer_is_host(src->buffer)) {
|
|
591
698
|
memcpy(dst->data, src->data, ggml_nbytes(src));
|
|
592
699
|
return true;
|
|
@@ -596,15 +703,16 @@ GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t b
|
|
|
596
703
|
GGML_UNUSED(buffer);
|
|
597
704
|
}
|
|
598
705
|
|
|
599
|
-
|
|
706
|
+
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
600
707
|
memset(buffer->context, value, buffer->size);
|
|
601
708
|
}
|
|
602
709
|
|
|
603
|
-
static struct ggml_backend_buffer_i
|
|
604
|
-
/* .get_name = */
|
|
710
|
+
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
|
711
|
+
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
|
605
712
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
|
606
713
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
607
714
|
/* .init_tensor = */ NULL, // no initialization required
|
|
715
|
+
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
|
608
716
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
609
717
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
610
718
|
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
@@ -612,12 +720,12 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|
|
612
720
|
/* .reset = */ NULL,
|
|
613
721
|
};
|
|
614
722
|
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
|
723
|
+
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
|
724
|
+
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
|
|
618
725
|
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
|
619
726
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
620
727
|
/* .init_tensor = */ NULL, // no initialization required
|
|
728
|
+
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
|
621
729
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
622
730
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
623
731
|
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
@@ -625,13 +733,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|
|
625
733
|
/* .reset = */ NULL,
|
|
626
734
|
};
|
|
627
735
|
|
|
628
|
-
|
|
736
|
+
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
629
737
|
return "CPU";
|
|
630
738
|
|
|
631
739
|
GGML_UNUSED(buft);
|
|
632
740
|
}
|
|
633
741
|
|
|
634
|
-
|
|
742
|
+
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
635
743
|
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
|
636
744
|
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
|
637
745
|
if (data == NULL) {
|
|
@@ -639,24 +747,24 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer
|
|
|
639
747
|
return NULL;
|
|
640
748
|
}
|
|
641
749
|
|
|
642
|
-
return ggml_backend_buffer_init(buft,
|
|
750
|
+
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
|
643
751
|
}
|
|
644
752
|
|
|
645
|
-
|
|
753
|
+
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
646
754
|
return TENSOR_ALIGNMENT;
|
|
647
755
|
|
|
648
756
|
GGML_UNUSED(buft);
|
|
649
757
|
}
|
|
650
758
|
|
|
651
|
-
|
|
759
|
+
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
652
760
|
return true;
|
|
653
761
|
|
|
654
762
|
GGML_UNUSED(buft);
|
|
655
763
|
}
|
|
656
764
|
|
|
657
|
-
|
|
765
|
+
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
658
766
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
|
659
|
-
/* .iface
|
|
767
|
+
/* .iface = */ {
|
|
660
768
|
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
|
661
769
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
|
662
770
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
@@ -664,6 +772,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
|
664
772
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
665
773
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
666
774
|
},
|
|
775
|
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
|
667
776
|
/* .context = */ NULL,
|
|
668
777
|
};
|
|
669
778
|
|
|
@@ -676,23 +785,23 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
|
676
785
|
|
|
677
786
|
#include <hbwmalloc.h>
|
|
678
787
|
|
|
679
|
-
|
|
788
|
+
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
680
789
|
return "CPU_HBM";
|
|
681
790
|
|
|
682
791
|
GGML_UNUSED(buft);
|
|
683
792
|
}
|
|
684
793
|
|
|
685
|
-
|
|
794
|
+
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
|
686
795
|
return "CPU_HBM";
|
|
687
796
|
|
|
688
797
|
GGML_UNUSED(buf);
|
|
689
798
|
}
|
|
690
799
|
|
|
691
|
-
|
|
800
|
+
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
692
801
|
hbw_free(buffer->context);
|
|
693
802
|
}
|
|
694
803
|
|
|
695
|
-
|
|
804
|
+
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
696
805
|
//void * ptr = hbw_malloc(size);
|
|
697
806
|
void * ptr;
|
|
698
807
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
|
@@ -727,28 +836,30 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
|
727
836
|
#endif
|
|
728
837
|
|
|
729
838
|
struct ggml_backend_cpu_context {
|
|
730
|
-
int
|
|
731
|
-
|
|
732
|
-
|
|
839
|
+
int n_threads;
|
|
840
|
+
ggml_threadpool_t threadpool;
|
|
841
|
+
|
|
842
|
+
uint8_t * work_data;
|
|
843
|
+
size_t work_size;
|
|
733
844
|
|
|
734
845
|
ggml_abort_callback abort_callback;
|
|
735
846
|
void * abort_callback_data;
|
|
736
847
|
};
|
|
737
848
|
|
|
738
|
-
|
|
849
|
+
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
|
739
850
|
return "CPU";
|
|
740
851
|
|
|
741
852
|
GGML_UNUSED(backend);
|
|
742
853
|
}
|
|
743
854
|
|
|
744
|
-
|
|
855
|
+
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
|
745
856
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
857
|
+
delete[] cpu_ctx->work_data;
|
|
858
|
+
delete cpu_ctx;
|
|
859
|
+
delete backend;
|
|
749
860
|
}
|
|
750
861
|
|
|
751
|
-
|
|
862
|
+
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
|
752
863
|
return ggml_backend_cpu_buffer_type();
|
|
753
864
|
|
|
754
865
|
GGML_UNUSED(backend);
|
|
@@ -759,18 +870,18 @@ struct ggml_backend_plan_cpu {
|
|
|
759
870
|
struct ggml_cgraph cgraph;
|
|
760
871
|
};
|
|
761
872
|
|
|
762
|
-
|
|
873
|
+
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
|
763
874
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
764
875
|
|
|
765
|
-
struct ggml_backend_plan_cpu * cpu_plan =
|
|
876
|
+
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
|
|
766
877
|
|
|
767
|
-
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
|
878
|
+
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
|
768
879
|
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
|
769
880
|
|
|
770
881
|
if (cpu_plan->cplan.work_size > 0) {
|
|
771
|
-
cpu_plan->cplan.work_data =
|
|
882
|
+
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
|
|
772
883
|
if (cpu_plan->cplan.work_data == NULL) {
|
|
773
|
-
|
|
884
|
+
delete cpu_plan;
|
|
774
885
|
return NULL;
|
|
775
886
|
}
|
|
776
887
|
}
|
|
@@ -781,16 +892,16 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
|
|
|
781
892
|
return cpu_plan;
|
|
782
893
|
}
|
|
783
894
|
|
|
784
|
-
|
|
895
|
+
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
785
896
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
|
786
897
|
|
|
787
|
-
|
|
788
|
-
|
|
898
|
+
delete[] cpu_plan->cplan.work_data;
|
|
899
|
+
delete cpu_plan;
|
|
789
900
|
|
|
790
901
|
GGML_UNUSED(backend);
|
|
791
902
|
}
|
|
792
903
|
|
|
793
|
-
|
|
904
|
+
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
|
794
905
|
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
|
795
906
|
|
|
796
907
|
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
|
@@ -798,21 +909,21 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
|
|
|
798
909
|
GGML_UNUSED(backend);
|
|
799
910
|
}
|
|
800
911
|
|
|
801
|
-
|
|
912
|
+
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
802
913
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
803
914
|
|
|
804
|
-
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
|
915
|
+
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
|
805
916
|
|
|
806
917
|
if (cpu_ctx->work_size < cplan.work_size) {
|
|
807
|
-
|
|
808
|
-
cpu_ctx->work_data =
|
|
918
|
+
delete[] cpu_ctx->work_data;
|
|
919
|
+
cpu_ctx->work_data = new uint8_t[cplan.work_size];
|
|
809
920
|
if (cpu_ctx->work_data == NULL) {
|
|
810
921
|
cpu_ctx->work_size = 0;
|
|
811
922
|
return GGML_STATUS_ALLOC_FAILED;
|
|
812
923
|
}
|
|
813
924
|
cpu_ctx->work_size = cplan.work_size;
|
|
814
925
|
}
|
|
815
|
-
cplan.work_data = cpu_ctx->work_data;
|
|
926
|
+
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
|
|
816
927
|
|
|
817
928
|
cplan.abort_callback = cpu_ctx->abort_callback;
|
|
818
929
|
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
|
@@ -820,31 +931,8 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
|
|
|
820
931
|
return ggml_graph_compute(cgraph, &cplan);
|
|
821
932
|
}
|
|
822
933
|
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
case GGML_OP_CPY:
|
|
826
|
-
return
|
|
827
|
-
op->type != GGML_TYPE_IQ2_XXS &&
|
|
828
|
-
op->type != GGML_TYPE_IQ2_XS &&
|
|
829
|
-
op->type != GGML_TYPE_IQ1_S &&
|
|
830
|
-
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
|
831
|
-
case GGML_OP_MUL_MAT:
|
|
832
|
-
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
|
833
|
-
default:
|
|
834
|
-
return true;
|
|
835
|
-
}
|
|
836
|
-
|
|
837
|
-
GGML_UNUSED(backend);
|
|
838
|
-
}
|
|
839
|
-
|
|
840
|
-
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
841
|
-
return ggml_backend_buft_is_host(buft);
|
|
842
|
-
|
|
843
|
-
GGML_UNUSED(backend);
|
|
844
|
-
}
|
|
845
|
-
|
|
846
|
-
static struct ggml_backend_i cpu_backend_i = {
|
|
847
|
-
/* .get_name = */ ggml_backend_cpu_name,
|
|
934
|
+
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
|
935
|
+
/* .get_name = */ ggml_backend_cpu_get_name,
|
|
848
936
|
/* .free = */ ggml_backend_cpu_free,
|
|
849
937
|
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
|
850
938
|
/* .set_tensor_async = */ NULL,
|
|
@@ -856,14 +944,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
|
856
944
|
/* .graph_plan_update = */ NULL,
|
|
857
945
|
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
|
858
946
|
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
|
859
|
-
/* .supports_op = */
|
|
860
|
-
/* .supports_buft = */
|
|
947
|
+
/* .supports_op = */ NULL,
|
|
948
|
+
/* .supports_buft = */ NULL,
|
|
861
949
|
/* .offload_op = */ NULL,
|
|
862
|
-
/* .event_new = */ NULL,
|
|
863
|
-
/* .event_free = */ NULL,
|
|
864
950
|
/* .event_record = */ NULL,
|
|
865
951
|
/* .event_wait = */ NULL,
|
|
866
|
-
/* .event_synchronize = */ NULL,
|
|
867
952
|
};
|
|
868
953
|
|
|
869
954
|
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
@@ -872,32 +957,34 @@ static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
|
872
957
|
}
|
|
873
958
|
|
|
874
959
|
ggml_backend_t ggml_backend_cpu_init(void) {
|
|
875
|
-
struct ggml_backend_cpu_context * ctx =
|
|
960
|
+
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
|
|
876
961
|
if (ctx == NULL) {
|
|
877
962
|
return NULL;
|
|
878
963
|
}
|
|
879
964
|
|
|
880
965
|
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
|
966
|
+
ctx->threadpool = NULL;
|
|
881
967
|
ctx->work_data = NULL;
|
|
882
968
|
ctx->work_size = 0;
|
|
883
969
|
ctx->abort_callback = NULL;
|
|
884
970
|
ctx->abort_callback_data = NULL;
|
|
885
971
|
|
|
886
|
-
ggml_backend_t cpu_backend =
|
|
972
|
+
ggml_backend_t cpu_backend = new ggml_backend {
|
|
973
|
+
/* .guid = */ ggml_backend_cpu_guid(),
|
|
974
|
+
/* .interface = */ ggml_backend_cpu_i,
|
|
975
|
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
|
976
|
+
/* .context = */ ctx,
|
|
977
|
+
};
|
|
978
|
+
|
|
887
979
|
if (cpu_backend == NULL) {
|
|
888
|
-
|
|
980
|
+
delete ctx;
|
|
889
981
|
return NULL;
|
|
890
982
|
}
|
|
891
983
|
|
|
892
|
-
*cpu_backend = (struct ggml_backend) {
|
|
893
|
-
/* .guid = */ ggml_backend_cpu_guid(),
|
|
894
|
-
/* .interface = */ cpu_backend_i,
|
|
895
|
-
/* .context = */ ctx
|
|
896
|
-
};
|
|
897
984
|
return cpu_backend;
|
|
898
985
|
}
|
|
899
986
|
|
|
900
|
-
|
|
987
|
+
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
|
901
988
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
|
902
989
|
}
|
|
903
990
|
|
|
@@ -908,6 +995,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
|
908
995
|
ctx->n_threads = n_threads;
|
|
909
996
|
}
|
|
910
997
|
|
|
998
|
+
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
|
999
|
+
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
|
1000
|
+
|
|
1001
|
+
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
|
1002
|
+
|
|
1003
|
+
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
|
1004
|
+
// already had a different threadpool, pause/suspend it before switching
|
|
1005
|
+
ggml_threadpool_pause(ctx->threadpool);
|
|
1006
|
+
}
|
|
1007
|
+
ctx->threadpool = threadpool;
|
|
1008
|
+
}
|
|
1009
|
+
|
|
911
1010
|
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
|
912
1011
|
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
|
913
1012
|
|
|
@@ -916,16 +1015,226 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
|
|
|
916
1015
|
ctx->abort_callback_data = abort_callback_data;
|
|
917
1016
|
}
|
|
918
1017
|
|
|
919
|
-
|
|
1018
|
+
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
|
920
1019
|
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
|
921
|
-
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(),
|
|
1020
|
+
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
|
922
1021
|
}
|
|
923
1022
|
|
|
924
|
-
|
|
1023
|
+
////////////////////////
|
|
1024
|
+
|
|
1025
|
+
struct ggml_backend_cpu_device_context {
|
|
1026
|
+
std::string description = "CPU";
|
|
1027
|
+
|
|
1028
|
+
ggml_backend_cpu_device_context() {
|
|
1029
|
+
#ifdef __APPLE__
|
|
1030
|
+
size_t len = 0;
|
|
1031
|
+
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
|
|
1032
|
+
description.resize(len);
|
|
1033
|
+
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
|
|
1034
|
+
}
|
|
1035
|
+
#elif defined(__linux__)
|
|
1036
|
+
FILE * f = fopen("/proc/cpuinfo", "r");
|
|
1037
|
+
if (f) {
|
|
1038
|
+
char buf[1024];
|
|
1039
|
+
while (fgets(buf, sizeof(buf), f)) {
|
|
1040
|
+
if (strncmp(buf, "model name", 10) == 0) {
|
|
1041
|
+
char * p = strchr(buf, ':');
|
|
1042
|
+
if (p) {
|
|
1043
|
+
p++;
|
|
1044
|
+
while (std::isspace(*p)) {
|
|
1045
|
+
p++;
|
|
1046
|
+
}
|
|
1047
|
+
while (std::isspace(p[strlen(p) - 1])) {
|
|
1048
|
+
p[strlen(p) - 1] = '\0';
|
|
1049
|
+
}
|
|
1050
|
+
description = p;
|
|
1051
|
+
break;
|
|
1052
|
+
}
|
|
1053
|
+
}
|
|
1054
|
+
}
|
|
1055
|
+
fclose(f);
|
|
1056
|
+
}
|
|
1057
|
+
#elif defined(_WIN32)
|
|
1058
|
+
HKEY hKey;
|
|
1059
|
+
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
|
1060
|
+
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
|
1061
|
+
0,
|
|
1062
|
+
KEY_READ,
|
|
1063
|
+
&hKey) == ERROR_SUCCESS) {
|
|
1064
|
+
DWORD cpu_brand_size = 0;
|
|
1065
|
+
if (RegQueryValueExA(hKey,
|
|
1066
|
+
TEXT("ProcessorNameString"),
|
|
1067
|
+
NULL,
|
|
1068
|
+
NULL,
|
|
1069
|
+
NULL,
|
|
1070
|
+
&cpu_brand_size) == ERROR_SUCCESS) {
|
|
1071
|
+
description.resize(cpu_brand_size);
|
|
1072
|
+
if (RegQueryValueExA(hKey,
|
|
1073
|
+
TEXT("ProcessorNameString"),
|
|
1074
|
+
NULL,
|
|
1075
|
+
NULL,
|
|
1076
|
+
(LPBYTE)&description[0], // NOLINT
|
|
1077
|
+
&cpu_brand_size) == ERROR_SUCCESS) {
|
|
1078
|
+
if (description.find('\0') != std::string::npos) {
|
|
1079
|
+
description.resize(description.find('\0'));
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
}
|
|
1083
|
+
RegCloseKey(hKey);
|
|
1084
|
+
}
|
|
1085
|
+
#endif
|
|
1086
|
+
}
|
|
1087
|
+
};
|
|
1088
|
+
|
|
1089
|
+
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
|
|
1090
|
+
return "CPU";
|
|
1091
|
+
|
|
1092
|
+
GGML_UNUSED(dev);
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
|
|
1096
|
+
struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
|
|
1097
|
+
|
|
1098
|
+
return ctx->description.c_str();
|
|
1099
|
+
}
|
|
1100
|
+
|
|
1101
|
+
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
|
1102
|
+
// TODO
|
|
1103
|
+
*free = 0;
|
|
1104
|
+
*total = 0;
|
|
1105
|
+
|
|
1106
|
+
GGML_UNUSED(dev);
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
|
1110
|
+
return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
|
|
1111
|
+
|
|
1112
|
+
GGML_UNUSED(dev);
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
|
1116
|
+
props->name = ggml_backend_cpu_device_get_name(dev);
|
|
1117
|
+
props->description = ggml_backend_cpu_device_get_description(dev);
|
|
1118
|
+
props->type = ggml_backend_cpu_device_get_type(dev);
|
|
1119
|
+
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
|
1120
|
+
props->caps = {
|
|
1121
|
+
/* async */ false,
|
|
1122
|
+
/* host_buffer */ false,
|
|
1123
|
+
/* events */ false,
|
|
1124
|
+
};
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
|
|
925
1128
|
return ggml_backend_cpu_init();
|
|
926
1129
|
|
|
1130
|
+
GGML_UNUSED(dev);
|
|
927
1131
|
GGML_UNUSED(params);
|
|
928
|
-
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
|
|
1135
|
+
return ggml_backend_cpu_buffer_type();
|
|
1136
|
+
|
|
1137
|
+
GGML_UNUSED(dev);
|
|
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ1_S   &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        case GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_OUT_PROD:
+            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+        default:
+            return true;
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
+    /* .get_name             = */ ggml_backend_cpu_device_get_name,
+    /* .get_description      = */ ggml_backend_cpu_device_get_description,
+    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
+    /* .get_type             = */ ggml_backend_cpu_device_get_type,
+    /* .get_props            = */ ggml_backend_cpu_device_get_props,
+    /* .init_backend         = */ ggml_backend_cpu_device_init,
+    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+////////////////////////
+
+static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
+    return "CPU";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_cpu_device_context ctx;
+    static ggml_backend_device ggml_backend_cpu_device = {
+        /* .iface   = */ ggml_backend_cpu_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ &ctx,
+    };
+
+    return &ggml_backend_cpu_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
+    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
+    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    static struct ggml_backend_reg ggml_backend_cpu_reg = {
+        /* .iface   = */ ggml_backend_cpu_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_reg;
 }
 
 // multi-buffer buffer
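The block of added lines above (1138-1237) wires the CPU backend into the new device/registry interface: a device vtable (`ggml_backend_cpu_device_i`), a registry vtable (`ggml_backend_cpu_reg_i`), and the `ggml_backend_cpu_reg()` entry point. As a hedged illustration only (not part of this package), the sketch below shows how code inside this translation unit could walk that registry through the interface tables; the helper name `list_cpu_devices` is hypothetical, and it assumes the internal `ggml_backend_reg` / `ggml_backend_device` struct layouts are visible, as they are in ggml-backend.cpp itself.

```cpp
// Hypothetical sketch: enumerate the CPU registry through the vtables added above.
#include <cstdio>

static void list_cpu_devices(void) {
    ggml_backend_reg_t reg = ggml_backend_cpu_reg();                 // entry point added above
    for (size_t i = 0; i < reg->iface.get_device_count(reg); i++) {  // the CPU registry reports 1 device
        ggml_backend_dev_t dev = reg->iface.get_device(reg, i);      // static singleton device
        printf("%s: %s\n",
               reg->iface.get_name(reg),                             // "CPU"
               dev->iface.get_description(dev));                     // host CPU description string
    }
}
```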
@@ -935,16 +1244,14 @@ struct ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };
 
-
-
-GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
 
     return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
 }
 
-
-
+static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         ggml_backend_buffer_free(ctx->buffers[i]);
     }
@@ -953,31 +1260,28 @@ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_
     free(ctx);
 }
 
-
-
+static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         ggml_backend_buffer_clear(ctx->buffers[i], value);
     }
 }
 
-static struct ggml_backend_buffer_i
-
-
-
-
-
-
-
-
-
-
-
-    return multi_backend_buffer_i;
-}
+static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
+    /* .get_name        = */ ggml_backend_multi_buffer_get_name,
+    /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
+    /* .get_base        = */ NULL,
+    /* .init_tensor     = */ NULL,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ NULL,
+    /* .get_tensor      = */ NULL,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_multi_buffer_clear,
+    /* .reset           = */ NULL,
+};
 
-
-
+ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
     ctx->n_buffers = n_buffers;
     ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
 
@@ -989,16 +1293,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
         total_size += ggml_backend_buffer_get_size(buffers[i]);
     }
 
-    return ggml_backend_buffer_init(buffers[0]->buft,
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
 }
 
-
+bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
 }
 
-
+void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
     GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
-
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
     }
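The multi-buffer hunks above mainly drop the `GGML_CALL` annotations, replace the old context typedef with explicit C++ casts, and spell out the buffer vtable `ggml_backend_multi_buffer_i`. A hedged usage sketch follows (not part of the package; the helper name `demo_multi_buffer` and the 16 MiB sizes are illustrative):

```cpp
// Hypothetical sketch: aggregate two separately allocated buffers behind one
// multi-buffer handle so they can be tagged and freed as a single unit.
static void demo_multi_buffer(ggml_backend_t backend) {
    ggml_backend_buffer_t parts[2] = {
        ggml_backend_alloc_buffer(backend, 16u * 1024 * 1024),
        ggml_backend_alloc_buffer(backend, 16u * 1024 * 1024),
    };
    ggml_backend_buffer_t combined = ggml_backend_multi_buffer_alloc_buffer(parts, 2);
    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(combined));
    ggml_backend_multi_buffer_set_usage(combined, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    ggml_backend_buffer_free(combined); // frees both underlying buffers via the vtable above
}
```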
@@ -1023,10 +1327,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_BACKENDS 16
 #endif
 
-#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 2048
-#endif
-
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
 #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
@@ -1130,7 +1430,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }
 
 #if 0
-
+#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1158,6 +1459,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+    }
+
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1551,10 +1857,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             i_split++;
             if (i_split >= sched->splits_capacity) {
                 sched->splits_capacity *= 2;
-                sched->splits =
+                sched->splits = (ggml_backend_sched_split *)
+                    realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                 GGML_ASSERT(sched->splits != NULL);
             }
-            GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
             split = &sched->splits[i_split];
             split->backend_id = node_backend_id;
             split->i_start = i;
@@ -1638,11 +1944,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
-    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
-        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
-        sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
         GGML_ASSERT(sched->graph.nodes != NULL);
         GGML_ASSERT(sched->graph.leafs != NULL);
     }
@@ -1690,6 +1996,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int c = 0; c < sched->n_copies; c++) {
                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1703,6 +2010,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int c = 0; c < sched->n_copies; c++) {
                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1713,6 +2021,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        assert(graph_copy->size > graph_copy->n_leafs);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
 }
@@ -1782,7 +2091,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
-
+                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    ggml_backend_synchronize(input_backend);
+                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                        ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                    } else {
+                        ggml_backend_synchronize(split_backend);
+                    }
+                    ggml_backend_tensor_copy(input, input_cpy);
+                }
             }
         }
 
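The scheduler hunk above replaces the old unconditional input copy with an async-first path. Restated as a hedged sketch (the helper name `sched_copy_input` is not part of the package; it only condenses the logic added above):

```cpp
// Sketch of the fallback added above: prefer the destination backend's async
// tensor copy; if it is unavailable or refuses the copy, synchronize explicitly
// and fall back to a plain synchronous copy.
static void sched_copy_input(ggml_backend_t src_backend, ggml_backend_t dst_backend,
                             ggml_backend_event_t copy_event,
                             struct ggml_tensor * input, struct ggml_tensor * input_cpy) {
    const bool async_ok = dst_backend->iface.cpy_tensor_async &&
                          dst_backend->iface.cpy_tensor_async(src_backend, dst_backend, input, input_cpy);
    if (!async_ok) {
        ggml_backend_synchronize(src_backend);          // the source data must be ready
        if (copy_event != NULL) {
            ggml_backend_event_synchronize(copy_event); // wait only for this copy slot
        } else {
            ggml_backend_synchronize(dst_backend);
        }
        ggml_backend_tensor_copy(input, input_cpy);     // synchronous fallback
    }
}
```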
@@ -1828,7 +2147,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         // record the event of this copy
         if (split->n_inputs > 0) {
             if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
             }
         }
     }
@@ -1848,7 +2167,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
-    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
+    struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
     sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
     sched->n_backends = n_backends;
@@ -1857,20 +2176,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
     // initialize hash table
     // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
     sched->hash_set = ggml_hash_set_new(graph_size);
-    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
-    sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+    sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
 
-    const size_t
-
-    sched->
-    sched->
-    sched->
+    const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
-    sched->context_buffer_size =
-    sched->context_buffer = malloc(sched->context_buffer_size);
+    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer = (char *) malloc(sched->context_buffer_size);
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+    sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
@@ -1879,7 +2199,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
            for (int c = 0; c < sched->n_copies; c++) {
-                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+                sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
            }
        }
    }
@@ -2115,8 +2435,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
 
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
-    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
-    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
+    struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -2134,7 +2454,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     free(node_init);
     ggml_free(ctx_allocated);
     ggml_free(ctx_unallocated);
-    return
+    return {
         /* .buffer           = */ NULL,
         /* .ctx_allocated    = */ NULL,
         /* .ctx_unallocated  = */ NULL,
@@ -2157,7 +2477,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     free(node_init);
     ggml_free(ctx_allocated);
     ggml_free(ctx_unallocated);
-    return
+    return {
         /* .buffer           = */ NULL,
         /* .ctx_allocated    = */ NULL,
         /* .ctx_unallocated  = */ NULL,
@@ -2186,7 +2506,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     free(node_copies);
     free(node_init);
 
-    return
+    return {
         /* .buffer           = */ buffer,
         /* .ctx_allocated    = */ ctx_allocated,
         /* .ctx_unallocated  = */ ctx_unallocated,