@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
// Note: porting this file to C++ is a work in progress
|
|
2
|
+
|
|
3
|
+
#ifdef _WIN32
|
|
4
|
+
#define WIN32_LEAN_AND_MEAN
|
|
5
|
+
#ifndef NOMINMAX
|
|
6
|
+
# define NOMINMAX
|
|
7
|
+
#endif
|
|
8
|
+
#include <windows.h>
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
#include "ggml-backend.h"
|
|
1
12
|
#include "ggml-backend-impl.h"
|
|
2
13
|
#include "ggml-alloc.h"
|
|
3
14
|
#include "ggml-impl.h"
|
|
@@ -8,9 +19,14 @@
|
|
|
8
19
|
#include <stdio.h>
|
|
9
20
|
#include <stdlib.h>
|
|
10
21
|
#include <string.h>
|
|
22
|
+
#include <string>
|
|
23
|
+
#include <vector>
|
|
11
24
|
|
|
25
|
+
#ifdef __APPLE__
|
|
26
|
+
#include <sys/types.h>
|
|
27
|
+
#include <sys/sysctl.h>
|
|
28
|
+
#endif
|
|
12
29
|
|
|
13
|
-
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
14
30
|
|
|
15
31
|
// backend buffer type
|
|
16
32
|
|
|
@@ -18,7 +34,12 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
|
|
18
34
|
return buft->iface.get_name(buft);
|
|
19
35
|
}
|
|
20
36
|
|
|
21
|
-
|
|
37
|
+
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
38
|
+
if (size == 0) {
|
|
39
|
+
// return a dummy buffer for zero-sized allocations
|
|
40
|
+
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
|
41
|
+
}
|
|
42
|
+
|
|
22
43
|
return buft->iface.alloc_buffer(buft, size);
|
|
23
44
|
}
|
|
24
45
|
|
|
@@ -34,7 +55,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
|
|
34
55
|
return SIZE_MAX;
|
|
35
56
|
}
|
|
36
57
|
|
|
37
|
-
|
|
58
|
+
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
|
38
59
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
|
39
60
|
if (buft->iface.get_alloc_size) {
|
|
40
61
|
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
|
@@ -51,16 +72,18 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
|
|
51
72
|
return false;
|
|
52
73
|
}
|
|
53
74
|
|
|
54
|
-
|
|
75
|
+
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
|
|
76
|
+
return buft->device;
|
|
77
|
+
}
|
|
55
78
|
|
|
56
|
-
|
|
57
|
-
ggml_backend_buffer_type_t buft,
|
|
58
|
-
struct ggml_backend_buffer_i iface,
|
|
59
|
-
ggml_backend_buffer_context_t context,
|
|
60
|
-
size_t size) {
|
|
61
|
-
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
|
79
|
+
// backend buffer
|
|
62
80
|
|
|
63
|
-
|
|
81
|
+
ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
82
|
+
ggml_backend_buffer_type_t buft,
|
|
83
|
+
struct ggml_backend_buffer_i iface,
|
|
84
|
+
void * context,
|
|
85
|
+
size_t size) {
|
|
86
|
+
ggml_backend_buffer_t buffer = new ggml_backend_buffer {
|
|
64
87
|
/* .interface = */ iface,
|
|
65
88
|
/* .buft = */ buft,
|
|
66
89
|
/* .context = */ context,
|
|
@@ -72,7 +95,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
|
72
95
|
}
|
|
73
96
|
|
|
74
97
|
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
|
75
|
-
return
|
|
98
|
+
return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
|
|
76
99
|
}
|
|
77
100
|
|
|
78
101
|
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
|
@@ -83,7 +106,7 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
|
|
83
106
|
if (buffer->iface.free_buffer != NULL) {
|
|
84
107
|
buffer->iface.free_buffer(buffer);
|
|
85
108
|
}
|
|
86
|
-
|
|
109
|
+
delete buffer;
|
|
87
110
|
}
|
|
88
111
|
|
|
89
112
|
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
|
@@ -91,6 +114,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
|
|
91
114
|
}
|
|
92
115
|
|
|
93
116
|
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
117
|
+
// get_base is optional if the buffer is zero-sized
|
|
118
|
+
if (buffer->size == 0) {
|
|
119
|
+
return NULL;
|
|
120
|
+
}
|
|
121
|
+
|
|
94
122
|
void * base = buffer->iface.get_base(buffer);
|
|
95
123
|
|
|
96
124
|
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
|
@@ -98,14 +126,23 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
98
126
|
return base;
|
|
99
127
|
}
|
|
100
128
|
|
|
101
|
-
|
|
129
|
+
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
|
102
130
|
// init_tensor is optional
|
|
103
131
|
if (buffer->iface.init_tensor) {
|
|
104
132
|
buffer->iface.init_tensor(buffer, tensor);
|
|
105
133
|
}
|
|
106
134
|
}
|
|
107
135
|
|
|
108
|
-
|
|
136
|
+
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
137
|
+
// clear is optional if the buffer is zero-sized
|
|
138
|
+
if (buffer->size == 0) {
|
|
139
|
+
return;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
buffer->iface.clear(buffer, value);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
|
109
146
|
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
|
110
147
|
}
|
|
111
148
|
|
|
@@ -117,10 +154,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
|
|
|
117
154
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
|
118
155
|
}
|
|
119
156
|
|
|
120
|
-
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
121
|
-
buffer->iface.clear(buffer, value);
|
|
122
|
-
}
|
|
123
|
-
|
|
124
157
|
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
|
125
158
|
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
|
126
159
|
}
|
|
@@ -181,7 +214,7 @@ void ggml_backend_free(ggml_backend_t backend) {
|
|
|
181
214
|
}
|
|
182
215
|
|
|
183
216
|
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
|
184
|
-
return backend->
|
|
217
|
+
return ggml_backend_dev_buffer_type(backend->device);
|
|
185
218
|
}
|
|
186
219
|
|
|
187
220
|
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
|
@@ -218,32 +251,47 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
|
|
218
251
|
}
|
|
219
252
|
}
|
|
220
253
|
|
|
221
|
-
|
|
254
|
+
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
222
255
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
223
256
|
|
|
257
|
+
if (size == 0) {
|
|
258
|
+
return;
|
|
259
|
+
}
|
|
260
|
+
|
|
224
261
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
225
262
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
226
263
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
227
264
|
|
|
228
|
-
if (!size) {
|
|
229
|
-
return;
|
|
230
|
-
}
|
|
231
|
-
|
|
232
265
|
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
|
233
266
|
}
|
|
234
267
|
|
|
235
|
-
|
|
268
|
+
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
236
269
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
237
270
|
|
|
271
|
+
if (size == 0) {
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
|
|
238
275
|
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
239
276
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
240
277
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
241
278
|
|
|
242
|
-
|
|
279
|
+
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
283
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
284
|
+
|
|
285
|
+
if (size == 0) {
|
|
243
286
|
return;
|
|
244
287
|
}
|
|
245
288
|
|
|
246
|
-
|
|
289
|
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
290
|
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
291
|
+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
292
|
+
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
|
|
293
|
+
|
|
294
|
+
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
|
247
295
|
}
|
|
248
296
|
|
|
249
297
|
void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
@@ -283,18 +331,19 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
|
|
|
283
331
|
}
|
|
284
332
|
|
|
285
333
|
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
286
|
-
return backend->
|
|
334
|
+
return ggml_backend_dev_supports_op(backend->device, op);
|
|
287
335
|
}
|
|
288
336
|
|
|
289
337
|
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
290
|
-
return backend->
|
|
338
|
+
return ggml_backend_dev_supports_buft(backend->device, buft);
|
|
291
339
|
}
|
|
292
340
|
|
|
293
341
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
342
|
+
return ggml_backend_dev_offload_op(backend->device, op);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
|
346
|
+
return backend->device;
|
|
298
347
|
}
|
|
299
348
|
|
|
300
349
|
// backend copy
|
|
@@ -327,7 +376,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
|
|
|
327
376
|
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
|
328
377
|
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
|
329
378
|
#ifndef NDEBUG
|
|
330
|
-
|
|
379
|
+
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
|
331
380
|
#endif
|
|
332
381
|
size_t nbytes = ggml_nbytes(src);
|
|
333
382
|
void * data = malloc(nbytes);
|
|
@@ -351,43 +400,39 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
|
|
|
351
400
|
}
|
|
352
401
|
|
|
353
402
|
// an async copy would normally happen after all the queued operations on both backends are completed
|
|
354
|
-
//
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
} else {
|
|
359
|
-
ggml_backend_synchronize(backend_src);
|
|
360
|
-
ggml_backend_tensor_copy(src, dst);
|
|
361
|
-
ggml_backend_synchronize(backend_dst);
|
|
362
|
-
}
|
|
403
|
+
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
|
|
404
|
+
ggml_backend_synchronize(backend_src);
|
|
405
|
+
ggml_backend_synchronize(backend_dst);
|
|
406
|
+
ggml_backend_tensor_copy(src, dst);
|
|
363
407
|
}
|
|
364
408
|
|
|
365
409
|
// events
|
|
366
410
|
|
|
367
|
-
ggml_backend_event_t ggml_backend_event_new(
|
|
368
|
-
|
|
411
|
+
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
|
|
412
|
+
// null device is allowed for the transition period to the device interface
|
|
413
|
+
if (device == NULL || device->iface.event_new == NULL) {
|
|
369
414
|
return NULL;
|
|
370
415
|
}
|
|
371
|
-
return
|
|
416
|
+
return device->iface.event_new(device);
|
|
372
417
|
}
|
|
373
418
|
|
|
374
419
|
void ggml_backend_event_free(ggml_backend_event_t event) {
|
|
375
420
|
if (event == NULL) {
|
|
376
421
|
return;
|
|
377
422
|
}
|
|
378
|
-
event->
|
|
423
|
+
event->device->iface.event_free(event->device, event);
|
|
379
424
|
}
|
|
380
425
|
|
|
381
|
-
void ggml_backend_event_record(ggml_backend_event_t event) {
|
|
382
|
-
GGML_ASSERT(
|
|
426
|
+
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
|
427
|
+
GGML_ASSERT(backend->iface.event_record != NULL);
|
|
383
428
|
|
|
384
|
-
|
|
429
|
+
backend->iface.event_record(backend, event);
|
|
385
430
|
}
|
|
386
431
|
|
|
387
432
|
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
|
388
|
-
GGML_ASSERT(event->
|
|
433
|
+
GGML_ASSERT(event->device->iface.event_synchronize);
|
|
389
434
|
|
|
390
|
-
event->
|
|
435
|
+
event->device->iface.event_synchronize(event->device, event);
|
|
391
436
|
}
|
|
392
437
|
|
|
393
438
|
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
@@ -396,536 +441,88 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
|
|
396
441
|
backend->iface.event_wait(backend, event);
|
|
397
442
|
}
|
|
398
443
|
|
|
399
|
-
//
|
|
400
|
-
|
|
401
|
-
#define GGML_REG_MAX_BACKENDS 64
|
|
402
|
-
|
|
403
|
-
struct ggml_backend_reg {
|
|
404
|
-
char name[128];
|
|
405
|
-
ggml_backend_init_fn init_fn;
|
|
406
|
-
ggml_backend_buffer_type_t default_buffer_type;
|
|
407
|
-
void * user_data;
|
|
408
|
-
};
|
|
409
|
-
|
|
410
|
-
static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
|
|
411
|
-
static size_t ggml_backend_registry_count = 0;
|
|
412
|
-
|
|
413
|
-
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
|
414
|
-
|
|
415
|
-
GGML_CALL static void ggml_backend_registry_init(void) {
|
|
416
|
-
static bool initialized = false;
|
|
417
|
-
|
|
418
|
-
if (initialized) {
|
|
419
|
-
return;
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
initialized = true;
|
|
423
|
-
|
|
424
|
-
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
|
|
425
|
-
|
|
426
|
-
// add forward decls here to avoid including the backend headers
|
|
427
|
-
#ifdef GGML_USE_CUDA
|
|
428
|
-
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
|
429
|
-
ggml_backend_cuda_reg_devices();
|
|
430
|
-
#endif
|
|
431
|
-
|
|
432
|
-
#ifdef GGML_USE_SYCL
|
|
433
|
-
extern void ggml_backend_sycl_reg_devices(void);
|
|
434
|
-
ggml_backend_sycl_reg_devices();
|
|
435
|
-
#endif
|
|
436
|
-
|
|
437
|
-
#ifdef GGML_USE_METAL
|
|
438
|
-
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
|
439
|
-
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
|
440
|
-
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
|
441
|
-
#endif
|
|
442
|
-
|
|
443
|
-
#ifdef GGML_USE_VULKAN
|
|
444
|
-
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
|
|
445
|
-
ggml_backend_vk_reg_devices();
|
|
446
|
-
#endif
|
|
447
|
-
|
|
448
|
-
#ifdef GGML_USE_KOMPUTE
|
|
449
|
-
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
|
|
450
|
-
ggml_backend_kompute_reg_devices();
|
|
451
|
-
#endif
|
|
452
|
-
|
|
453
|
-
#ifdef GGML_USE_CANN
|
|
454
|
-
extern GGML_CALL int ggml_backend_cann_reg_devices(void);
|
|
455
|
-
ggml_backend_cann_reg_devices();
|
|
456
|
-
#endif
|
|
457
|
-
}
|
|
458
|
-
|
|
459
|
-
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
|
460
|
-
GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
|
|
461
|
-
|
|
462
|
-
size_t id = ggml_backend_registry_count;
|
|
463
|
-
|
|
464
|
-
ggml_backend_registry[id] = (struct ggml_backend_reg) {
|
|
465
|
-
/* .name = */ {0},
|
|
466
|
-
/* .fn = */ init_fn,
|
|
467
|
-
/* .default_buffer_type = */ default_buffer_type,
|
|
468
|
-
/* .user_data = */ user_data,
|
|
469
|
-
};
|
|
470
|
-
|
|
471
|
-
snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
|
|
472
|
-
|
|
473
|
-
#ifndef NDEBUG
|
|
474
|
-
fprintf(stderr, "%s: registered backend %s\n", __func__, name);
|
|
475
|
-
#endif
|
|
476
|
-
|
|
477
|
-
ggml_backend_registry_count++;
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
size_t ggml_backend_reg_get_count(void) {
|
|
481
|
-
ggml_backend_registry_init();
|
|
444
|
+
// Backend device
|
|
482
445
|
|
|
483
|
-
|
|
446
|
+
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
|
447
|
+
return device->iface.get_name(device);
|
|
484
448
|
}
|
|
485
449
|
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
for (size_t i = 0; i < ggml_backend_registry_count; i++) {
|
|
490
|
-
// TODO: case insensitive in a portable way
|
|
491
|
-
if (strcmp(ggml_backend_registry[i].name, name) == 0) {
|
|
492
|
-
return i;
|
|
493
|
-
}
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
// not found
|
|
497
|
-
return SIZE_MAX;
|
|
450
|
+
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
|
|
451
|
+
return device->iface.get_description(device);
|
|
498
452
|
}
|
|
499
453
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
ggml_backend_registry_init();
|
|
503
|
-
|
|
504
|
-
const char * params = strchr(backend_str, ':');
|
|
505
|
-
char backend_name[128];
|
|
506
|
-
if (params == NULL) {
|
|
507
|
-
snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
|
|
508
|
-
params = "";
|
|
509
|
-
} else {
|
|
510
|
-
snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
|
|
511
|
-
params++;
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
|
|
515
|
-
|
|
516
|
-
if (backend_i == SIZE_MAX) {
|
|
517
|
-
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
|
|
518
|
-
return NULL;
|
|
519
|
-
}
|
|
520
|
-
|
|
521
|
-
return ggml_backend_reg_init_backend(backend_i, params);
|
|
454
|
+
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
|
455
|
+
device->iface.get_memory(device, free, total);
|
|
522
456
|
}
|
|
523
457
|
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
|
528
|
-
return ggml_backend_registry[i].name;
|
|
458
|
+
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
|
|
459
|
+
return device->iface.get_type(device);
|
|
529
460
|
}
|
|
530
461
|
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
|
535
|
-
return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
|
|
462
|
+
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
|
|
463
|
+
memset(props, 0, sizeof(*props));
|
|
464
|
+
device->iface.get_props(device, props);
|
|
536
465
|
}
|
|
537
466
|
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
|
542
|
-
return ggml_backend_registry[i].default_buffer_type;
|
|
543
|
-
}
|
|
544
|
-
|
|
545
|
-
ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
|
546
|
-
ggml_backend_registry_init();
|
|
547
|
-
|
|
548
|
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
|
549
|
-
return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
|
|
550
|
-
}
|
|
551
|
-
|
|
552
|
-
// backend CPU
|
|
553
|
-
|
|
554
|
-
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
|
555
|
-
|
|
556
|
-
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
|
557
|
-
return "CPU";
|
|
558
|
-
|
|
559
|
-
GGML_UNUSED(buffer);
|
|
560
|
-
}
|
|
561
|
-
|
|
562
|
-
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
563
|
-
uintptr_t data = (uintptr_t)buffer->context;
|
|
564
|
-
|
|
565
|
-
// align the buffer
|
|
566
|
-
if (data % TENSOR_ALIGNMENT != 0) {
|
|
567
|
-
data = GGML_PAD(data, TENSOR_ALIGNMENT);
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
return (void *)data;
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
574
|
-
free(buffer->context);
|
|
575
|
-
}
|
|
576
|
-
|
|
577
|
-
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
578
|
-
memcpy((char *)tensor->data + offset, data, size);
|
|
579
|
-
|
|
580
|
-
GGML_UNUSED(buffer);
|
|
581
|
-
}
|
|
582
|
-
|
|
583
|
-
GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
584
|
-
memcpy(data, (const char *)tensor->data + offset, size);
|
|
585
|
-
|
|
586
|
-
GGML_UNUSED(buffer);
|
|
587
|
-
}
|
|
588
|
-
|
|
589
|
-
GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
590
|
-
if (ggml_backend_buffer_is_host(src->buffer)) {
|
|
591
|
-
memcpy(dst->data, src->data, ggml_nbytes(src));
|
|
592
|
-
return true;
|
|
593
|
-
}
|
|
594
|
-
return false;
|
|
595
|
-
|
|
596
|
-
GGML_UNUSED(buffer);
|
|
597
|
-
}
|
|
598
|
-
|
|
599
|
-
GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
600
|
-
memset(buffer->context, value, buffer->size);
|
|
601
|
-
}
|
|
602
|
-
|
|
603
|
-
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|
604
|
-
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
|
605
|
-
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
|
606
|
-
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
607
|
-
/* .init_tensor = */ NULL, // no initialization required
|
|
608
|
-
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
609
|
-
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
610
|
-
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
611
|
-
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
612
|
-
/* .reset = */ NULL,
|
|
613
|
-
};
|
|
614
|
-
|
|
615
|
-
// for buffers from ptr, free is not called
|
|
616
|
-
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|
617
|
-
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
|
618
|
-
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
|
619
|
-
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
|
620
|
-
/* .init_tensor = */ NULL, // no initialization required
|
|
621
|
-
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
|
622
|
-
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
|
623
|
-
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
|
624
|
-
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
625
|
-
/* .reset = */ NULL,
|
|
626
|
-
};
|
|
627
|
-
|
|
628
|
-
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
629
|
-
return "CPU";
|
|
630
|
-
|
|
631
|
-
GGML_UNUSED(buft);
|
|
632
|
-
}
|
|
633
|
-
|
|
634
|
-
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
635
|
-
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
|
636
|
-
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
|
637
|
-
if (data == NULL) {
|
|
638
|
-
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
|
639
|
-
return NULL;
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
|
643
|
-
}
|
|
644
|
-
|
|
645
|
-
GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
646
|
-
return TENSOR_ALIGNMENT;
|
|
647
|
-
|
|
648
|
-
GGML_UNUSED(buft);
|
|
649
|
-
}
|
|
650
|
-
|
|
651
|
-
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
652
|
-
return true;
|
|
653
|
-
|
|
654
|
-
GGML_UNUSED(buft);
|
|
655
|
-
}
|
|
656
|
-
|
|
657
|
-
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
658
|
-
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
|
659
|
-
/* .iface = */ {
|
|
660
|
-
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
|
661
|
-
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
|
662
|
-
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
663
|
-
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
664
|
-
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
665
|
-
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
666
|
-
},
|
|
667
|
-
/* .context = */ NULL,
|
|
668
|
-
};
|
|
669
|
-
|
|
670
|
-
return &ggml_backend_cpu_buffer_type;
|
|
671
|
-
}
|
|
672
|
-
|
|
673
|
-
#ifdef GGML_USE_CPU_HBM
|
|
674
|
-
|
|
675
|
-
// buffer type HBM
|
|
676
|
-
|
|
677
|
-
#include <hbwmalloc.h>
|
|
678
|
-
|
|
679
|
-
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
|
680
|
-
return "CPU_HBM";
|
|
681
|
-
|
|
682
|
-
GGML_UNUSED(buft);
|
|
467
|
+
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
|
|
468
|
+
return device->reg;
|
|
683
469
|
}
|
|
684
470
|
|
|
685
|
-
|
|
686
|
-
return
|
|
687
|
-
|
|
688
|
-
GGML_UNUSED(buf);
|
|
471
|
+
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
|
|
472
|
+
return device->iface.init_backend(device, params);
|
|
689
473
|
}
|
|
690
474
|
|
|
691
|
-
|
|
692
|
-
|
|
475
|
+
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
|
476
|
+
return device->iface.get_buffer_type(device);
|
|
693
477
|
}
|
|
694
478
|
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
void * ptr;
|
|
698
|
-
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
|
699
|
-
if (result != 0) {
|
|
700
|
-
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
|
|
479
|
+
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
|
|
480
|
+
if (device->iface.get_host_buffer_type == NULL) {
|
|
701
481
|
return NULL;
|
|
702
482
|
}
|
|
703
483
|
|
|
704
|
-
|
|
705
|
-
buffer->buft = buft;
|
|
706
|
-
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
|
707
|
-
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
|
708
|
-
|
|
709
|
-
return buffer;
|
|
710
|
-
}
|
|
711
|
-
|
|
712
|
-
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
713
|
-
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
|
714
|
-
/* .iface = */ {
|
|
715
|
-
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
|
716
|
-
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
|
717
|
-
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
718
|
-
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
719
|
-
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
720
|
-
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
|
721
|
-
},
|
|
722
|
-
/* .context = */ NULL,
|
|
723
|
-
};
|
|
724
|
-
|
|
725
|
-
return &ggml_backend_cpu_buffer_type_hbm;
|
|
484
|
+
return device->iface.get_host_buffer_type(device);
|
|
726
485
|
}
|
|
727
|
-
#endif
|
|
728
|
-
|
|
729
|
-
struct ggml_backend_cpu_context {
|
|
730
|
-
int n_threads;
|
|
731
|
-
void * work_data;
|
|
732
|
-
size_t work_size;
|
|
733
|
-
|
|
734
|
-
ggml_abort_callback abort_callback;
|
|
735
|
-
void * abort_callback_data;
|
|
736
|
-
};
|
|
737
|
-
|
|
738
|
-
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
|
739
|
-
return "CPU";
|
|
740
486
|
|
|
741
|
-
|
|
487
|
+
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
|
|
488
|
+
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
|
|
742
489
|
}
|
|
743
490
|
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
free(cpu_ctx->work_data);
|
|
747
|
-
free(cpu_ctx);
|
|
748
|
-
free(backend);
|
|
491
|
+
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
492
|
+
return device->iface.supports_op(device, op);
|
|
749
493
|
}
|
|
750
494
|
|
|
751
|
-
|
|
752
|
-
return
|
|
753
|
-
|
|
754
|
-
GGML_UNUSED(backend);
|
|
495
|
+
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
|
|
496
|
+
return device->iface.supports_buft(device, buft);
|
|
755
497
|
}
|
|
756
498
|
|
|
757
|
-
struct
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
};
|
|
761
|
-
|
|
762
|
-
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
|
763
|
-
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
|
764
|
-
|
|
765
|
-
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
|
766
|
-
|
|
767
|
-
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
|
768
|
-
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
|
769
|
-
|
|
770
|
-
if (cpu_plan->cplan.work_size > 0) {
|
|
771
|
-
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
|
772
|
-
if (cpu_plan->cplan.work_data == NULL) {
|
|
773
|
-
free(cpu_plan);
|
|
774
|
-
return NULL;
|
|
775
|
-
}
|
|
499
|
+
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
|
500
|
+
if (device->iface.offload_op != NULL) {
|
|
501
|
+
return device->iface.offload_op(device, op);
|
|
776
502
|
}
|
|
777
503
|
|
|
778
|
-
|
|
779
|
-
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
|
780
|
-
|
|
781
|
-
return cpu_plan;
|
|
504
|
+
return false;
|
|
782
505
|
}
|
|
783
506
|
|
|
784
|
-
|
|
785
|
-
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
|
786
|
-
|
|
787
|
-
free(cpu_plan->cplan.work_data);
|
|
788
|
-
free(cpu_plan);
|
|
507
|
+
// Backend (reg)
|
|
789
508
|
|
|
790
|
-
|
|
509
|
+
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
|
510
|
+
return reg->iface.get_name(reg);
|
|
791
511
|
}
|
|
792
512
|
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
|
797
|
-
|
|
798
|
-
GGML_UNUSED(backend);
|
|
513
|
+
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
|
514
|
+
return reg->iface.get_device_count(reg);
|
|
799
515
|
}
|
|
800
516
|
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
|
805
|
-
|
|
806
|
-
if (cpu_ctx->work_size < cplan.work_size) {
|
|
807
|
-
free(cpu_ctx->work_data);
|
|
808
|
-
cpu_ctx->work_data = malloc(cplan.work_size);
|
|
809
|
-
if (cpu_ctx->work_data == NULL) {
|
|
810
|
-
cpu_ctx->work_size = 0;
|
|
811
|
-
return GGML_STATUS_ALLOC_FAILED;
|
|
812
|
-
}
|
|
813
|
-
cpu_ctx->work_size = cplan.work_size;
|
|
814
|
-
}
|
|
815
|
-
cplan.work_data = cpu_ctx->work_data;
|
|
816
|
-
|
|
817
|
-
cplan.abort_callback = cpu_ctx->abort_callback;
|
|
818
|
-
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
|
819
|
-
|
|
820
|
-
return ggml_graph_compute(cgraph, &cplan);
|
|
821
|
-
}
|
|
822
|
-
|
|
823
|
-
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
824
|
-
switch (op->op) {
|
|
825
|
-
case GGML_OP_CPY:
|
|
826
|
-
return
|
|
827
|
-
op->type != GGML_TYPE_IQ2_XXS &&
|
|
828
|
-
op->type != GGML_TYPE_IQ2_XS &&
|
|
829
|
-
op->type != GGML_TYPE_IQ1_S &&
|
|
830
|
-
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
|
831
|
-
case GGML_OP_MUL_MAT:
|
|
832
|
-
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
|
833
|
-
default:
|
|
834
|
-
return true;
|
|
835
|
-
}
|
|
836
|
-
|
|
837
|
-
GGML_UNUSED(backend);
|
|
838
|
-
}
|
|
839
|
-
|
|
840
|
-
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
841
|
-
return ggml_backend_buft_is_host(buft);
|
|
842
|
-
|
|
843
|
-
GGML_UNUSED(backend);
|
|
844
|
-
}
|
|
845
|
-
|
|
846
|
-
static struct ggml_backend_i cpu_backend_i = {
|
|
847
|
-
/* .get_name = */ ggml_backend_cpu_name,
|
|
848
|
-
/* .free = */ ggml_backend_cpu_free,
|
|
849
|
-
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
|
850
|
-
/* .set_tensor_async = */ NULL,
|
|
851
|
-
/* .get_tensor_async = */ NULL,
|
|
852
|
-
/* .cpy_tensor_async = */ NULL,
|
|
853
|
-
/* .synchronize = */ NULL,
|
|
854
|
-
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
|
855
|
-
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
|
856
|
-
/* .graph_plan_update = */ NULL,
|
|
857
|
-
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
|
858
|
-
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
|
859
|
-
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
|
860
|
-
/* .supports_buft = */ ggml_backend_cpu_supports_buft,
|
|
861
|
-
/* .offload_op = */ NULL,
|
|
862
|
-
/* .event_new = */ NULL,
|
|
863
|
-
/* .event_free = */ NULL,
|
|
864
|
-
/* .event_record = */ NULL,
|
|
865
|
-
/* .event_wait = */ NULL,
|
|
866
|
-
/* .event_synchronize = */ NULL,
|
|
867
|
-
};
|
|
868
|
-
|
|
869
|
-
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
|
870
|
-
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
|
871
|
-
return &guid;
|
|
517
|
+
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
|
518
|
+
return reg->iface.get_device(reg, index);
|
|
872
519
|
}
|
|
873
520
|
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
if (ctx == NULL) {
|
|
877
|
-
return NULL;
|
|
878
|
-
}
|
|
879
|
-
|
|
880
|
-
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
|
881
|
-
ctx->work_data = NULL;
|
|
882
|
-
ctx->work_size = 0;
|
|
883
|
-
ctx->abort_callback = NULL;
|
|
884
|
-
ctx->abort_callback_data = NULL;
|
|
885
|
-
|
|
886
|
-
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
|
887
|
-
if (cpu_backend == NULL) {
|
|
888
|
-
free(ctx);
|
|
521
|
+
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
522
|
+
if (!reg->iface.get_proc_address) {
|
|
889
523
|
return NULL;
|
|
890
524
|
}
|
|
891
|
-
|
|
892
|
-
*cpu_backend = (struct ggml_backend) {
|
|
893
|
-
/* .guid = */ ggml_backend_cpu_guid(),
|
|
894
|
-
/* .interface = */ cpu_backend_i,
|
|
895
|
-
/* .context = */ ctx
|
|
896
|
-
};
|
|
897
|
-
return cpu_backend;
|
|
898
|
-
}
|
|
899
|
-
|
|
900
|
-
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
|
901
|
-
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
|
902
|
-
}
|
|
903
|
-
|
|
904
|
-
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
905
|
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
|
906
|
-
|
|
907
|
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
|
908
|
-
ctx->n_threads = n_threads;
|
|
909
|
-
}
|
|
910
|
-
|
|
911
|
-
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
|
912
|
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
|
913
|
-
|
|
914
|
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
|
915
|
-
ctx->abort_callback = abort_callback;
|
|
916
|
-
ctx->abort_callback_data = abort_callback_data;
|
|
917
|
-
}
|
|
918
|
-
|
|
919
|
-
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
|
920
|
-
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
|
921
|
-
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
|
922
|
-
}
|
|
923
|
-
|
|
924
|
-
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
|
925
|
-
return ggml_backend_cpu_init();
|
|
926
|
-
|
|
927
|
-
GGML_UNUSED(params);
|
|
928
|
-
GGML_UNUSED(user_data);
|
|
525
|
+
return reg->iface.get_proc_address(reg, name);
|
|
929
526
|
}
|
|
930
527
|
|
|
931
528
|
// multi-buffer buffer
|
|
@@ -935,16 +532,8 @@ struct ggml_backend_multi_buffer_context {
|
|
|
935
532
|
size_t n_buffers;
|
|
936
533
|
};
|
|
937
534
|
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
941
|
-
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
|
942
|
-
|
|
943
|
-
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
|
|
944
|
-
}
|
|
945
|
-
|
|
946
|
-
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
947
|
-
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
|
535
|
+
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
536
|
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
|
948
537
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
|
949
538
|
ggml_backend_buffer_free(ctx->buffers[i]);
|
|
950
539
|
}
|
|
@@ -953,31 +542,27 @@ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_
|
|
|
953
542
|
free(ctx);
|
|
954
543
|
}
|
|
955
544
|
|
|
956
|
-
|
|
957
|
-
|
|
545
|
+
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
546
|
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
|
958
547
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
|
959
548
|
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
|
960
549
|
}
|
|
961
550
|
}
|
|
962
551
|
|
|
963
|
-
static struct ggml_backend_buffer_i
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
};
|
|
975
|
-
|
|
976
|
-
return multi_backend_buffer_i;
|
|
977
|
-
}
|
|
552
|
+
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
|
|
553
|
+
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
|
554
|
+
/* .get_base = */ NULL,
|
|
555
|
+
/* .init_tensor = */ NULL,
|
|
556
|
+
/* .memset_tensor = */ NULL,
|
|
557
|
+
/* .set_tensor = */ NULL,
|
|
558
|
+
/* .get_tensor = */ NULL,
|
|
559
|
+
/* .cpy_tensor = */ NULL,
|
|
560
|
+
/* .clear = */ ggml_backend_multi_buffer_clear,
|
|
561
|
+
/* .reset = */ NULL,
|
|
562
|
+
};
|
|
978
563
|
|
|
979
|
-
|
|
980
|
-
|
|
564
|
+
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
|
|
565
|
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
|
|
981
566
|
ctx->n_buffers = n_buffers;
|
|
982
567
|
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
|
983
568
|
|
|
@@ -989,16 +574,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back
|
|
|
989
574
|
total_size += ggml_backend_buffer_get_size(buffers[i]);
|
|
990
575
|
}
|
|
991
576
|
|
|
992
|
-
return ggml_backend_buffer_init(buffers[0]->buft,
|
|
577
|
+
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
|
|
993
578
|
}
|
|
994
579
|
|
|
995
|
-
|
|
996
|
-
return buffer->iface.
|
|
580
|
+
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
|
581
|
+
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
|
|
997
582
|
}
|
|
998
583
|
|
|
999
|
-
|
|
584
|
+
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
|
1000
585
|
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
|
1001
|
-
|
|
586
|
+
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
|
1002
587
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
|
1003
588
|
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
|
|
1004
589
|
}
|
|
@@ -1023,10 +608,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_BACKENDS 16
 #endif
 
-#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 2048
-#endif
-
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
 #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
@@ -1089,7 +670,7 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
-
+    int debug;
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1108,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }
 
 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1122,7 +703,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     }
 
 #ifndef NDEBUG
-
+    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
         __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
 #endif
 
@@ -1130,7 +711,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }
 
 #if 0
-
+#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1140,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
 
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
-    // TODO: use supports_op to check if the backend supports the op
-
     // assign pre-allocated nodes to their backend
     int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
@@ -1158,6 +738,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
+    }
+
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1171,7 +756,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (src == NULL) {
             continue;
         }
-
+        // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+        // not an ideal solution
+        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
@@ -1205,32 +792,34 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-
+                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
-
+            GGML_LOG_DEBUG("\n");
             cur_split++;
         }
         struct ggml_tensor * node = graph->nodes[i];
         if (ggml_is_view_op(node->op)) {
             continue;
         }
-
-
-
-
-
-
-
+        if (sched->debug > 1) {
+            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
             }
-
-            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
-                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+            GGML_LOG_DEBUG("\n");
         }
-            fprintf(stderr, "\n");
     }
 }
 
@@ -1295,6 +884,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1522,11 +1114,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            // check if a weight is on a different backend
+            // check if a weight is on a different and incompatible backend
             // by starting a new split, the memory of the previously offloaded weights can be reused
             if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                 int src_backend_id = tensor_backend_id(src);
-                if (src_backend_id != cur_backend_id) {
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     need_new_split = true;
                     break;
                 }
@@ -1538,7 +1130,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 int src_backend_id = sched->hv_tensor_backend_ids[id];
                 bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                 if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
-                    //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;
                 }
@@ -1551,10 +1142,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             i_split++;
             if (i_split >= sched->splits_capacity) {
                 sched->splits_capacity *= 2;
-                sched->splits =
+                sched->splits = (ggml_backend_sched_split *)
+                    realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                 GGML_ASSERT(sched->splits != NULL);
             }
-            GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
             split = &sched->splits[i_split];
             split->backend_id = node_backend_id;
             split->i_start = i;
@@ -1638,11 +1229,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
-    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
-        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
-        sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
         GGML_ASSERT(sched->graph.nodes != NULL);
         GGML_ASSERT(sched->graph.leafs != NULL);
     }
@@ -1690,6 +1281,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int c = 0; c < sched->n_copies; c++) {
                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1703,6 +1295,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             for (int c = 0; c < sched->n_copies; c++) {
                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1713,6 +1306,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        assert(graph_copy->size > graph_copy->n_leafs);
        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
 }
@@ -1741,11 +1335,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
         ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-
+            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -1782,7 +1376,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             } else {
                 ggml_backend_synchronize(split_backend);
             }
-
+            // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+            // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+            if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                ggml_backend_synchronize(input_backend);
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            }
         }
     }
 
@@ -1828,7 +1432,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         // record the event of this copy
         if (split->n_inputs > 0) {
             if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
             }
         }
     }
@@ -1846,40 +1450,43 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(
+    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
 
-    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
+    struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
 
-
+    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
     sched->n_backends = n_backends;
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
     // initialize hash table
     // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
     sched->hash_set = ggml_hash_set_new(graph_size);
-    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
-    sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+    sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
 
-    const size_t
-
-    sched->
-    sched->
-    sched->
+    const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
-    sched->context_buffer_size =
-    sched->context_buffer = malloc(sched->context_buffer_size);
+    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer = (char *) malloc(sched->context_buffer_size);
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+    sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
         GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
-                sched->events[b][c] = ggml_backend_event_new(backends[b]);
+                sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
             }
         }
     }
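As the hunk above shows, the new `debug` field is read from the `GGML_SCHED_DEBUG` environment variable in `ggml_backend_sched_new()`, and values above 1 additionally enable the per-node dump in `ggml_backend_sched_print_assignments()`. A hypothetical way to turn it on from the host process (not part of the diff; `setenv()` is POSIX, use `_putenv_s()` on Windows):

```cpp
// Illustrative sketch only: the variable must be set before the scheduler is created,
// since ggml_backend_sched_new() reads it once via getenv()/atoi().
#include <cstdlib>

static void enable_sched_debug() {
    setenv("GGML_SCHED_DEBUG", "2", /*overwrite=*/1); // call before ggml_backend_sched_new()
}
```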
@@ -1932,12 +1539,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
+    ggml_backend_sched_synchronize(sched);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
     ggml_backend_sched_reset(sched);
-    ggml_backend_sched_synchronize(sched);
 
     return true;
 }
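For context, the scheduler functions touched in these hunks are driven through the existing public ggml-backend API. A minimal lifecycle sketch (not from the diff; error handling omitted, and the backends/graph are assumed to have been created elsewhere):

```cpp
// Illustrative sketch only: create the scheduler, reserve once with a
// worst-case graph (reserve now synchronizes before ggml_gallocr_reserve_n(),
// per the hunk above), then compute and free.
#include "ggml-backend.h"

void run_graph(ggml_backend_t * backends, int n_backends,
               struct ggml_cgraph * graph, size_t graph_size) {
    ggml_backend_sched_t sched =
        ggml_backend_sched_new(backends, /*bufts=*/NULL, n_backends, graph_size, /*parallel=*/false);

    ggml_backend_sched_reserve(sched, graph);       // pre-allocate split buffers
    ggml_backend_sched_graph_compute(sched, graph); // splits the graph and runs each split

    ggml_backend_sched_free(sched);
}
```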
@@ -2115,8 +1723,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
 
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
-    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
-    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
+    struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -2128,13 +1736,13 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_unallocated = ggml_init(params);
 
     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
         ggml_free(ctx_unallocated);
-        return
+        return {
             /* .buffer = */ NULL,
             /* .ctx_allocated = */ NULL,
             /* .ctx_unallocated = */ NULL,
@@ -2151,13 +1759,13 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
-
+        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
         ggml_free(ctx_unallocated);
-        return
+        return {
             /* .buffer = */ NULL,
             /* .ctx_allocated = */ NULL,
             /* .ctx_unallocated = */ NULL,
@@ -2186,7 +1794,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     free(node_copies);
     free(node_init);
 
-    return
+    return {
         /* .buffer = */ buffer,
         /* .ctx_allocated = */ ctx_allocated,
        /* .ctx_unallocated = */ ctx_unallocated,
@@ -2238,3 +1846,154 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     return true;
 }
+
+// CPU backend - buffer
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+
+static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+    /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+    /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear = */ ggml_backend_cpu_buffer_clear,
+    /* .reset = */ NULL,
+};
+
+// CPU backend buffer type
+
+// this buffer type is defined here to make it available to all backends
+
+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = ggml_aligned_malloc(size);
+
+    if (data == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+}
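The new CPU buffer code above keeps `ggml_backend_cpu_buffer_from_ptr()` as a public entry point for wrapping memory the caller already owns. A minimal sketch of how that might be used (not from the diff; the 32-byte alignment figure is an assumption standing in for `TENSOR_ALIGNMENT`, which the `GGML_ASSERT` in that function enforces):

```cpp
// Illustrative sketch only: wrap an existing, aligned host allocation in a
// CPU backend buffer. Because ggml_backend_cpu_buffer_from_ptr_i has
// .free_buffer == NULL, freeing the buffer does not free the pointer.
#include <cstdlib>
#include "ggml-backend.h"

int main() {
    const size_t size = 1024 * 1024;
    void * ptr = aligned_alloc(32, size);               // assumed alignment; C11/C++17, not MSVC
    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(ptr, size);

    // ... allocate tensors from `buf` (e.g. via ggml-alloc) and use them ...

    ggml_backend_buffer_free(buf); // releases the wrapper only
    free(ptr);                     // the caller still owns the memory
    return 0;
}
```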