@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
|
@@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
|
|
294
294
|
alloc->free_blocks[0].offset = 0;
|
|
295
295
|
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
|
296
296
|
alloc->max_size = 0;
|
|
297
|
+
|
|
298
|
+
#ifdef GGML_ALLOCATOR_DEBUG
|
|
299
|
+
for (int i = 0; i < 1024; i++) {
|
|
300
|
+
alloc->allocated_tensors[i].tensor = NULL;
|
|
301
|
+
}
|
|
302
|
+
#endif
|
|
297
303
|
}
|
|
298
304
|
|
|
299
305
|
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
|
@@ -9,144 +9,226 @@ extern "C" {
|
|
|
9
9
|
#endif
|
|
10
10
|
|
|
11
11
|
//
|
|
12
|
-
// Backend buffer
|
|
12
|
+
// Backend buffer type
|
|
13
13
|
//
|
|
14
14
|
|
|
15
|
-
// buffer type
|
|
16
|
-
typedef void * ggml_backend_buffer_type_context_t;
|
|
17
|
-
|
|
18
15
|
struct ggml_backend_buffer_type_i {
|
|
19
|
-
const char * (*
|
|
16
|
+
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
|
20
17
|
// allocate a buffer of this type
|
|
21
|
-
ggml_backend_buffer_t (*
|
|
18
|
+
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
|
22
19
|
// tensor alignment
|
|
23
|
-
size_t (*
|
|
24
|
-
// max buffer size that can be allocated
|
|
25
|
-
size_t (*
|
|
26
|
-
// data size needed to allocate the tensor, including padding
|
|
27
|
-
size_t (*
|
|
28
|
-
// check if tensor data is in host memory
|
|
29
|
-
bool (*
|
|
20
|
+
size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
|
|
21
|
+
// (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
|
|
22
|
+
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
|
|
23
|
+
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
|
|
24
|
+
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
|
25
|
+
// (optional) check if tensor data is in host memory (defaults to false)
|
|
26
|
+
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
|
30
27
|
};
|
|
31
28
|
|
|
32
29
|
struct ggml_backend_buffer_type {
|
|
33
30
|
struct ggml_backend_buffer_type_i iface;
|
|
34
|
-
|
|
31
|
+
ggml_backend_dev_t device;
|
|
32
|
+
void * context;
|
|
35
33
|
};
|
|
36
34
|
|
|
37
|
-
//
|
|
38
|
-
|
|
35
|
+
//
|
|
36
|
+
// Backend buffer
|
|
37
|
+
//
|
|
39
38
|
|
|
40
39
|
struct ggml_backend_buffer_i {
|
|
41
|
-
const char * (*
|
|
42
|
-
|
|
43
|
-
void
|
|
44
|
-
|
|
45
|
-
void
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
void (*
|
|
40
|
+
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
|
41
|
+
// (optional) free the buffer
|
|
42
|
+
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
|
43
|
+
// base address of the buffer
|
|
44
|
+
void * (*get_base) (ggml_backend_buffer_t buffer);
|
|
45
|
+
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
|
|
46
|
+
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
|
47
|
+
// tensor data access
|
|
48
|
+
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
|
49
|
+
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
50
|
+
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
51
|
+
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
|
|
52
|
+
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
53
|
+
// clear the entire buffer
|
|
54
|
+
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
|
55
|
+
// (optional) reset any internal state due to tensor initialization, such as tensor extras
|
|
56
|
+
void (*reset) (ggml_backend_buffer_t buffer);
|
|
50
57
|
};
|
|
51
58
|
|
|
52
59
|
struct ggml_backend_buffer {
|
|
53
60
|
struct ggml_backend_buffer_i iface;
|
|
54
61
|
ggml_backend_buffer_type_t buft;
|
|
55
|
-
|
|
62
|
+
void * context;
|
|
56
63
|
size_t size;
|
|
57
64
|
enum ggml_backend_buffer_usage usage;
|
|
58
65
|
};
|
|
59
66
|
|
|
60
|
-
|
|
61
|
-
ggml_backend_buffer_type_t
|
|
62
|
-
struct ggml_backend_buffer_i
|
|
63
|
-
|
|
64
|
-
size_t
|
|
67
|
+
ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
68
|
+
ggml_backend_buffer_type_t buft,
|
|
69
|
+
struct ggml_backend_buffer_i iface,
|
|
70
|
+
void * context,
|
|
71
|
+
size_t size);
|
|
65
72
|
|
|
66
73
|
// do not use directly, use ggml_backend_tensor_copy instead
|
|
67
74
|
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
68
75
|
|
|
76
|
+
// multi-buffer
|
|
69
77
|
// buffer that contains a collection of buffers
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
78
|
+
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
|
79
|
+
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
|
80
|
+
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
|
73
81
|
|
|
74
82
|
//
|
|
75
|
-
// Backend
|
|
83
|
+
// Backend (stream)
|
|
76
84
|
//
|
|
77
85
|
|
|
78
|
-
typedef void * ggml_backend_context_t;
|
|
79
|
-
|
|
80
86
|
struct ggml_backend_i {
|
|
81
|
-
const char * (*
|
|
87
|
+
const char * (*get_name)(ggml_backend_t backend);
|
|
82
88
|
|
|
83
|
-
void (*
|
|
89
|
+
void (*free)(ggml_backend_t backend);
|
|
84
90
|
|
|
85
91
|
// buffer allocation
|
|
86
|
-
ggml_backend_buffer_type_t (*
|
|
92
|
+
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
|
87
93
|
|
|
88
94
|
// (optional) asynchronous tensor data access
|
|
89
|
-
void (*
|
|
90
|
-
void (*
|
|
91
|
-
bool (*
|
|
95
|
+
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
96
|
+
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
97
|
+
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
92
98
|
|
|
93
99
|
// (optional) complete all pending operations
|
|
94
|
-
void (*
|
|
100
|
+
void (*synchronize)(ggml_backend_t backend);
|
|
95
101
|
|
|
96
|
-
// compute graph with a plan (not used currently)
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
102
|
+
// (optional) compute graph with a plan (not used currently)
|
|
103
|
+
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
|
104
|
+
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
100
105
|
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
|
101
|
-
void (*
|
|
106
|
+
void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
|
102
107
|
// compute the graph with the plan
|
|
103
|
-
enum ggml_status (*
|
|
108
|
+
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
109
|
+
|
|
110
|
+
// compute graph (always async if supported by the backend)
|
|
111
|
+
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
104
112
|
|
|
105
|
-
//
|
|
106
|
-
|
|
113
|
+
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
|
|
114
|
+
// new backends should implement the device interface instead
|
|
107
115
|
|
|
116
|
+
// These functions are being moved to the device interface
|
|
108
117
|
// check if the backend can compute an operation
|
|
109
|
-
bool (*
|
|
118
|
+
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
|
110
119
|
|
|
111
120
|
// check if the backend can use tensors allocated in a buffer type
|
|
112
|
-
bool (*
|
|
121
|
+
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
|
113
122
|
|
|
114
123
|
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
|
115
124
|
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
|
116
125
|
// even if the weight has to be copied from the CPU temporarily
|
|
117
|
-
bool (*
|
|
126
|
+
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
|
118
127
|
|
|
119
128
|
// (optional) event synchronization
|
|
120
|
-
//
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
|
125
|
-
// wait for an event on on a different backend instance
|
|
126
|
-
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
|
127
|
-
// block until an event is recorded
|
|
128
|
-
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
|
129
|
+
// record an event on this stream
|
|
130
|
+
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
|
131
|
+
// wait for an event on a different stream
|
|
132
|
+
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
|
129
133
|
};
|
|
130
134
|
|
|
131
135
|
struct ggml_backend {
|
|
132
136
|
ggml_guid_t guid;
|
|
133
|
-
|
|
134
137
|
struct ggml_backend_i iface;
|
|
135
|
-
|
|
138
|
+
ggml_backend_dev_t device;
|
|
139
|
+
void * context;
|
|
136
140
|
};
|
|
137
141
|
|
|
138
142
|
struct ggml_backend_event {
|
|
139
|
-
|
|
143
|
+
struct ggml_backend_device * device;
|
|
144
|
+
void * context;
|
|
145
|
+
};
|
|
146
|
+
|
|
147
|
+
//
|
|
148
|
+
// Backend device
|
|
149
|
+
//
|
|
150
|
+
|
|
151
|
+
// Note: if additional properties are needed, we should add a struct with all of them
|
|
152
|
+
// the current functions to obtain the properties can remain, since they are more convenient for often used properties
|
|
153
|
+
struct ggml_backend_device_i {
|
|
154
|
+
// device name: short identifier for this device, such as "CPU" or "CUDA0"
|
|
155
|
+
const char * (*get_name)(ggml_backend_dev_t dev);
|
|
156
|
+
|
|
157
|
+
// device description: short informative description of the device, could be the model name
|
|
158
|
+
const char * (*get_description)(ggml_backend_dev_t dev);
|
|
159
|
+
|
|
160
|
+
// device memory in bytes
|
|
161
|
+
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
|
|
162
|
+
|
|
163
|
+
// device type
|
|
164
|
+
enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
|
|
165
|
+
|
|
166
|
+
// device properties
|
|
167
|
+
void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
|
|
168
|
+
|
|
169
|
+
// backend (stream) initialization
|
|
170
|
+
ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
|
|
171
|
+
|
|
172
|
+
// preferred buffer type
|
|
173
|
+
ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
|
|
174
|
+
|
|
175
|
+
// (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
|
|
176
|
+
ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
|
|
177
|
+
|
|
178
|
+
// (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
|
|
179
|
+
ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
|
|
180
|
+
|
|
181
|
+
// check if the backend can compute an operation
|
|
182
|
+
bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
|
183
|
+
|
|
184
|
+
// check if the backend can use tensors allocated in a buffer type
|
|
185
|
+
bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
|
|
186
|
+
|
|
187
|
+
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
|
188
|
+
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
|
189
|
+
// even if the weight has to be copied from the CPU temporarily
|
|
190
|
+
bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
|
191
|
+
|
|
192
|
+
// (optional) event synchronization
|
|
193
|
+
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
|
|
194
|
+
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
|
195
|
+
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
|
196
|
+
};
|
|
197
|
+
|
|
198
|
+
struct ggml_backend_device {
|
|
199
|
+
struct ggml_backend_device_i iface;
|
|
200
|
+
ggml_backend_reg_t reg;
|
|
140
201
|
void * context;
|
|
141
202
|
};
|
|
142
203
|
|
|
143
204
|
//
|
|
144
|
-
// Backend
|
|
205
|
+
// Backend (reg)
|
|
145
206
|
//
|
|
146
207
|
|
|
147
|
-
|
|
208
|
+
struct ggml_backend_reg_i {
|
|
209
|
+
const char * (*get_name)(ggml_backend_reg_t reg);
|
|
210
|
+
|
|
211
|
+
// enumerate available devices
|
|
212
|
+
size_t (*get_device_count)(ggml_backend_reg_t reg);
|
|
213
|
+
ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
|
|
214
|
+
|
|
215
|
+
// (optional) get a pointer to a function in the backend
|
|
216
|
+
// backends can add custom functions that are not part of the standard ggml-backend interface
|
|
217
|
+
void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
|
|
218
|
+
};
|
|
219
|
+
|
|
220
|
+
struct ggml_backend_reg {
|
|
221
|
+
// int api_version; // TODO: for dynamic loading
|
|
222
|
+
struct ggml_backend_reg_i iface;
|
|
223
|
+
void * context;
|
|
224
|
+
};
|
|
225
|
+
|
|
148
226
|
|
|
149
|
-
|
|
227
|
+
// Internal backend registry API
|
|
228
|
+
void ggml_backend_register(ggml_backend_reg_t reg);
|
|
229
|
+
void ggml_backend_device_register(ggml_backend_dev_t device);
|
|
230
|
+
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
|
|
231
|
+
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
|
|
150
232
|
|
|
151
233
|
#ifdef __cplusplus
|
|
152
234
|
}
|