@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
#include <cstring>
|
|
31
31
|
#include <mutex>
|
|
32
32
|
|
|
33
|
+
#include "ggml-impl.h"
|
|
33
34
|
#include "ggml-backend-impl.h"
|
|
34
35
|
#include "ggml-cann/aclnn_ops.h"
|
|
35
36
|
#include "ggml-cann/common.h"
|
|
@@ -38,69 +39,6 @@
|
|
|
38
39
|
|
|
39
40
|
#include "ggml-common.h"
|
|
40
41
|
|
|
41
|
-
/**
|
|
42
|
-
* @brief Default logging callback for GGML.
|
|
43
|
-
*
|
|
44
|
-
* This function is the default logging callback that logs messages to stderr.
|
|
45
|
-
*
|
|
46
|
-
* @param level The log level.
|
|
47
|
-
* @param msg The log message.
|
|
48
|
-
* @param user_data User data passed to the callback.
|
|
49
|
-
*/
|
|
50
|
-
static void ggml_cann_default_log_callback(enum ggml_log_level level,
|
|
51
|
-
const char* msg, void* user_data) {
|
|
52
|
-
GGML_UNUSED(level);
|
|
53
|
-
GGML_UNUSED(user_data);
|
|
54
|
-
fprintf(stderr, "%s", msg);
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
|
|
58
|
-
void* ggml_cann_log_user_data = NULL;
|
|
59
|
-
|
|
60
|
-
GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
|
|
61
|
-
void* user_data) {
|
|
62
|
-
ggml_cann_log_callback = log_callback;
|
|
63
|
-
ggml_cann_log_user_data = user_data;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
|
67
|
-
#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
|
68
|
-
#define GGML_CANN_LOG_ERROR(...) \
|
|
69
|
-
ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
|
70
|
-
|
|
71
|
-
GGML_ATTRIBUTE_FORMAT(2, 3)
|
|
72
|
-
|
|
73
|
-
/**
|
|
74
|
-
* @brief Log a message using the current logging callback.
|
|
75
|
-
*
|
|
76
|
-
* This function formats a log message and passes it to the current logging
|
|
77
|
-
* callback.
|
|
78
|
-
*
|
|
79
|
-
* @param level The log level.
|
|
80
|
-
* @param format The format string for the log message.
|
|
81
|
-
* @param ... The arguments for the format string.
|
|
82
|
-
*/
|
|
83
|
-
static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
|
|
84
|
-
if (ggml_cann_log_callback != NULL) {
|
|
85
|
-
va_list args;
|
|
86
|
-
va_start(args, format);
|
|
87
|
-
char buffer[128];
|
|
88
|
-
int len = vsnprintf(buffer, 128, format, args);
|
|
89
|
-
if (len < 128) {
|
|
90
|
-
ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
|
|
91
|
-
} else {
|
|
92
|
-
// vsnprintf adds a null terminator
|
|
93
|
-
std::vector<char> buffer2(len + 1);
|
|
94
|
-
va_end(args);
|
|
95
|
-
va_start(args, format);
|
|
96
|
-
vsnprintf(&buffer2[0], buffer2.size(), format, args);
|
|
97
|
-
ggml_cann_log_callback(level, buffer2.data(),
|
|
98
|
-
ggml_cann_log_user_data);
|
|
99
|
-
}
|
|
100
|
-
va_end(args);
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
42
|
/**
|
|
105
43
|
* @brief Handles CANN errors by printing an error message and aborting.
|
|
106
44
|
*
|
|
@@ -115,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
|
|
|
115
53
|
int32_t id = -1;
|
|
116
54
|
aclrtGetDevice(&id);
|
|
117
55
|
|
|
118
|
-
|
|
119
|
-
|
|
56
|
+
GGML_LOG_ERROR("CANN error: %s\n", msg);
|
|
57
|
+
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
|
|
120
58
|
file, line);
|
|
121
|
-
|
|
59
|
+
GGML_LOG_ERROR(" %s\n", stmt);
|
|
122
60
|
// abort with GGML_ASSERT to get a stack trace
|
|
123
61
|
GGML_ABORT("CANN error");
|
|
124
62
|
}
|
|
@@ -164,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() {
|
|
|
164
102
|
aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
|
|
165
103
|
|
|
166
104
|
if (err != ACL_SUCCESS) {
|
|
167
|
-
|
|
105
|
+
GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
|
|
168
106
|
__func__, aclGetRecentErrMsg());
|
|
169
107
|
return info;
|
|
170
108
|
}
|
|
@@ -314,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
|
|
314
252
|
*actual_size = look_ahead_size;
|
|
315
253
|
pool_size += look_ahead_size;
|
|
316
254
|
#ifdef DEBUG_CANN_MALLOC
|
|
317
|
-
|
|
255
|
+
GGML_LOG_INFO(
|
|
318
256
|
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
|
|
319
257
|
"requested %u MB\n",
|
|
320
258
|
__func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
|
|
@@ -469,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
469
407
|
// add to the pool
|
|
470
408
|
pool_size += reserve_size;
|
|
471
409
|
|
|
472
|
-
//
|
|
410
|
+
// GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
|
|
473
411
|
// reserved %llu MB)\n",
|
|
474
412
|
// device, (unsigned long long) (pool_size/1024/1024),
|
|
475
413
|
// (unsigned long long) (reserve_size/1024/1024));
|
|
@@ -482,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
482
420
|
pool_used += size;
|
|
483
421
|
|
|
484
422
|
#ifdef DEBUG_CANN_MALLOC
|
|
485
|
-
|
|
423
|
+
GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
|
|
486
424
|
(unsigned long long)size, (unsigned long long)ptr);
|
|
487
425
|
#endif
|
|
488
426
|
return ptr;
|
|
@@ -496,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
496
434
|
*/
|
|
497
435
|
void free(void* ptr, size_t size) override {
|
|
498
436
|
#ifdef DEBUG_CANN_MALLOC
|
|
499
|
-
|
|
437
|
+
GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
|
|
500
438
|
(unsigned long long)size, (unsigned long long)ptr);
|
|
501
439
|
#endif
|
|
502
440
|
|
|
@@ -559,7 +497,7 @@ struct ggml_backend_cann_buffer_context {
|
|
|
559
497
|
* @return A pointer to a C-string containing the name of the buffer.
|
|
560
498
|
*/
|
|
561
499
|
|
|
562
|
-
|
|
500
|
+
static const char* ggml_backend_cann_buffer_get_name(
|
|
563
501
|
ggml_backend_buffer_t buffer) {
|
|
564
502
|
return "CANN";
|
|
565
503
|
|
|
@@ -575,7 +513,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
|
|
|
575
513
|
* @param buffer The buffer to check.
|
|
576
514
|
* @return true if the buffer is a CANN buffer, false otherwise.
|
|
577
515
|
*/
|
|
578
|
-
|
|
516
|
+
static bool ggml_backend_buffer_is_cann(
|
|
579
517
|
ggml_backend_buffer_t buffer) {
|
|
580
518
|
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
|
|
581
519
|
}
|
|
@@ -588,7 +526,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann(
|
|
|
588
526
|
*
|
|
589
527
|
* @param buffer The CANN buffer to free.
|
|
590
528
|
*/
|
|
591
|
-
|
|
529
|
+
static void ggml_backend_cann_buffer_free_buffer(
|
|
592
530
|
ggml_backend_buffer_t buffer) {
|
|
593
531
|
ggml_backend_cann_buffer_context* ctx =
|
|
594
532
|
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
@@ -604,7 +542,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
|
|
|
604
542
|
* @param buffer The CANN buffer whose base pointer is to be retrieved.
|
|
605
543
|
* @return A pointer to the base of the device memory allocated for the buffer.
|
|
606
544
|
*/
|
|
607
|
-
|
|
545
|
+
static void* ggml_backend_cann_buffer_get_base(
|
|
608
546
|
ggml_backend_buffer_t buffer) {
|
|
609
547
|
ggml_backend_cann_buffer_context* ctx =
|
|
610
548
|
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
@@ -624,10 +562,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
|
|
|
624
562
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
625
563
|
* stored.
|
|
626
564
|
*/
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
GGML_ASSERT(tensor->op == GGML_OP_NONE);
|
|
565
|
+
static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
566
|
+
const void* src,
|
|
567
|
+
void* dst) {
|
|
631
568
|
|
|
632
569
|
int64_t n_elems = ggml_nelements(tensor);
|
|
633
570
|
int64_t groups = n_elems / QK4_0;
|
|
@@ -677,9 +614,8 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
|
677
614
|
* @param dst Pointer to the destination buffer where the Q4.0 formatted data
|
|
678
615
|
* will be stored.
|
|
679
616
|
*/
|
|
680
|
-
|
|
617
|
+
static void ggml_backend_cann_transform_back_q4_0(
|
|
681
618
|
const ggml_tensor* tensor, void* src, void* dst) {
|
|
682
|
-
GGML_ASSERT(tensor->op == GGML_OP_NONE);
|
|
683
619
|
|
|
684
620
|
int64_t n_elems = ggml_nelements(tensor);
|
|
685
621
|
int64_t groups = n_elems / QK4_0;
|
|
@@ -727,9 +663,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
|
|
|
727
663
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
728
664
|
* stored.
|
|
729
665
|
*/
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
666
|
+
static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
|
667
|
+
const void* src,
|
|
668
|
+
void* dst) {
|
|
733
669
|
int64_t n_elems = ggml_nelements(tensor);
|
|
734
670
|
int64_t groups = n_elems / QK8_0;
|
|
735
671
|
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
@@ -761,7 +697,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
|
|
761
697
|
* @param dst Pointer to the destination buffer where the Q8.0 formatted data
|
|
762
698
|
* will be stored.
|
|
763
699
|
*/
|
|
764
|
-
|
|
700
|
+
static void ggml_backend_cann_transform_back_q8_0(
|
|
765
701
|
const ggml_tensor* tensor, const void* src, void* dst) {
|
|
766
702
|
int64_t n_elems = ggml_nelements(tensor);
|
|
767
703
|
int64_t groups = n_elems / QK8_0;
|
|
@@ -793,8 +729,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
|
|
|
793
729
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
794
730
|
* stored.
|
|
795
731
|
*/
|
|
796
|
-
|
|
797
|
-
|
|
732
|
+
static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
|
733
|
+
const void* src, void* dst) {
|
|
798
734
|
switch (tensor->type) {
|
|
799
735
|
case GGML_TYPE_Q4_0:
|
|
800
736
|
ggml_backend_cann_transform_q4_0(tensor, src, dst);
|
|
@@ -819,7 +755,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
|
|
819
755
|
* @param dst Pointer to the destination buffer where transformed tensor data
|
|
820
756
|
* will be stored.
|
|
821
757
|
*/
|
|
822
|
-
|
|
758
|
+
static void ggml_backend_cann_transform_back(
|
|
823
759
|
const ggml_tensor* tensor, void* src, void* dst) {
|
|
824
760
|
switch (tensor->type) {
|
|
825
761
|
case GGML_TYPE_Q4_0:
|
|
@@ -842,7 +778,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(
|
|
|
842
778
|
* @param type The tensor type to check.
|
|
843
779
|
* @return true if transformation is needed, false otherwise.
|
|
844
780
|
*/
|
|
845
|
-
|
|
781
|
+
static bool need_transform(ggml_type type) {
|
|
846
782
|
switch (type) {
|
|
847
783
|
case GGML_TYPE_Q4_0:
|
|
848
784
|
case GGML_TYPE_Q8_0:
|
|
@@ -861,7 +797,7 @@ GGML_CALL static bool need_transform(ggml_type type) {
|
|
|
861
797
|
* @param buffer The CANN buffer from which to initialize the tensor.
|
|
862
798
|
* @param tensor Pointer to the tensor to be initialized.
|
|
863
799
|
*/
|
|
864
|
-
|
|
800
|
+
static void ggml_backend_cann_buffer_init_tensor(
|
|
865
801
|
ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
|
|
866
802
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
867
803
|
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
|
@@ -897,12 +833,11 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
|
|
|
897
833
|
* @param offset Offset in the source data from where to start copying.
|
|
898
834
|
* @param size Size of the data to be copied, in bytes.
|
|
899
835
|
*/
|
|
900
|
-
|
|
901
|
-
ggml_backend_buffer_t buffer, ggml_tensor*
|
|
836
|
+
static void ggml_backend_cann_buffer_set_tensor(
|
|
837
|
+
ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
|
|
902
838
|
size_t offset, size_t size) {
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
839
|
+
ggml_backend_cann_buffer_context *ctx =
|
|
840
|
+
(ggml_backend_cann_buffer_context *)buffer->context;
|
|
906
841
|
|
|
907
842
|
ggml_cann_set_device(ctx->device);
|
|
908
843
|
// TODO: refer to cann(#6017), it use thread's default stream.
|
|
@@ -910,22 +845,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
|
|
|
910
845
|
// Why aclrtSynchronizeDevice?
|
|
911
846
|
|
|
912
847
|
if (!need_transform(tensor->type)) {
|
|
913
|
-
ACL_CHECK(aclrtMemcpy(tensor->data, size,
|
|
914
|
-
|
|
848
|
+
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
|
849
|
+
ACL_MEMCPY_HOST_TO_DEVICE));
|
|
915
850
|
} else {
|
|
916
|
-
void*
|
|
917
|
-
ggml_backend_cann_transform(tensor,
|
|
918
|
-
transform_buffer);
|
|
851
|
+
void *transform_buffer = malloc(size);
|
|
852
|
+
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
|
919
853
|
|
|
920
854
|
#ifndef NDEBUG
|
|
921
|
-
void*
|
|
855
|
+
void *check_buffer = malloc(size);
|
|
922
856
|
ggml_backend_cann_transform_back(tensor, transform_buffer,
|
|
923
857
|
check_buffer);
|
|
924
|
-
GGML_ASSERT(memcmp(
|
|
925
|
-
0);
|
|
858
|
+
GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
|
|
926
859
|
free(check_buffer);
|
|
927
860
|
#endif
|
|
928
|
-
ACL_CHECK(aclrtMemcpy(tensor->data
|
|
861
|
+
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
|
|
862
|
+
transform_buffer, size,
|
|
929
863
|
ACL_MEMCPY_HOST_TO_DEVICE));
|
|
930
864
|
free(transform_buffer);
|
|
931
865
|
}
|
|
@@ -944,24 +878,23 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
|
|
|
944
878
|
* @param offset Offset in the destination buffer where to start copying.
|
|
945
879
|
* @param size Size of the data to be copied, in bytes.
|
|
946
880
|
*/
|
|
947
|
-
|
|
881
|
+
static void ggml_backend_cann_buffer_get_tensor(
|
|
948
882
|
ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
|
|
949
883
|
size_t offset, size_t size) {
|
|
950
|
-
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
951
884
|
ggml_backend_cann_buffer_context* ctx =
|
|
952
885
|
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
953
886
|
|
|
954
887
|
ggml_cann_set_device(ctx->device);
|
|
955
888
|
|
|
956
889
|
if (!need_transform(tensor->type)) {
|
|
957
|
-
ACL_CHECK(aclrtMemcpy((char*)data + offset, size,
|
|
890
|
+
ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
|
|
958
891
|
ACL_MEMCPY_DEVICE_TO_HOST));
|
|
959
892
|
} else {
|
|
960
893
|
void* transform_buffer = malloc(size);
|
|
961
|
-
ACL_CHECK(aclrtMemcpy(transform_buffer, size,
|
|
894
|
+
ACL_CHECK(aclrtMemcpy(transform_buffer, size,
|
|
895
|
+
(char*)tensor->data + offset, size,
|
|
962
896
|
ACL_MEMCPY_DEVICE_TO_HOST));
|
|
963
|
-
ggml_backend_cann_transform_back(tensor, transform_buffer,
|
|
964
|
-
(char*)data + offset);
|
|
897
|
+
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
|
|
965
898
|
free(transform_buffer);
|
|
966
899
|
}
|
|
967
900
|
}
|
|
@@ -979,7 +912,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
|
|
|
979
912
|
* @param dst Pointer to the destination tensor where the data will be copied.
|
|
980
913
|
* @return true if the copy operation succeeded, false otherwise.
|
|
981
914
|
*/
|
|
982
|
-
|
|
915
|
+
static bool ggml_backend_cann_buffer_cpy_tensor(
|
|
983
916
|
ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
|
|
984
917
|
if (ggml_backend_buffer_is_cann(src->buffer)) {
|
|
985
918
|
ggml_backend_cann_buffer_context* src_ctx =
|
|
@@ -1021,7 +954,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
|
|
|
1021
954
|
* @param buffer The CANN buffer to be cleared.
|
|
1022
955
|
* @param value The value to which each byte in the buffer will be set.
|
|
1023
956
|
*/
|
|
1024
|
-
|
|
957
|
+
static void ggml_backend_cann_buffer_clear(
|
|
1025
958
|
ggml_backend_buffer_t buffer, uint8_t value) {
|
|
1026
959
|
ggml_backend_cann_buffer_context* ctx =
|
|
1027
960
|
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
@@ -1041,6 +974,7 @@ static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
|
|
|
1041
974
|
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
|
|
1042
975
|
/* .get_base = */ ggml_backend_cann_buffer_get_base,
|
|
1043
976
|
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
|
|
977
|
+
/* .memset_tensor = */ NULL,
|
|
1044
978
|
/* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
|
|
1045
979
|
/* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
|
|
1046
980
|
/* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
|
|
@@ -1068,7 +1002,7 @@ struct ggml_backend_cann_buffer_type_context {
|
|
|
1068
1002
|
* @param buft Pointer to the buffer type context.
|
|
1069
1003
|
* @return Const pointer to the C-style string containing the name.
|
|
1070
1004
|
*/
|
|
1071
|
-
|
|
1005
|
+
static const char* ggml_backend_cann_buffer_type_name(
|
|
1072
1006
|
ggml_backend_buffer_type_t buft) {
|
|
1073
1007
|
return "CANN";
|
|
1074
1008
|
|
|
@@ -1085,7 +1019,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
|
|
|
1085
1019
|
* @param size Size in bytes of the buffer to allocate.
|
|
1086
1020
|
* @return Pointer to the allocated buffer, or nullptr if allocation fails.
|
|
1087
1021
|
*/
|
|
1088
|
-
|
|
1022
|
+
static ggml_backend_buffer_t
|
|
1089
1023
|
ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
1090
1024
|
size_t size) {
|
|
1091
1025
|
ggml_backend_cann_buffer_type_context* buft_ctx =
|
|
@@ -1098,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
|
1098
1032
|
void* dev_ptr;
|
|
1099
1033
|
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
|
|
1100
1034
|
if (err != ACL_SUCCESS) {
|
|
1101
|
-
|
|
1035
|
+
GGML_LOG_ERROR(
|
|
1102
1036
|
"%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
|
|
1103
1037
|
__func__, size / 1024.0 / 1024.0, buft_ctx->device,
|
|
1104
1038
|
aclGetRecentErrMsg());
|
|
@@ -1124,7 +1058,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
|
1124
1058
|
* @return The alignment requirement in bytes (fixed at 128 bytes for CANN
|
|
1125
1059
|
* buffers).
|
|
1126
1060
|
*/
|
|
1127
|
-
|
|
1061
|
+
static size_t ggml_backend_cann_buffer_type_get_alignment(
|
|
1128
1062
|
ggml_backend_buffer_type_t buft) {
|
|
1129
1063
|
return 128;
|
|
1130
1064
|
|
|
@@ -1145,7 +1079,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
|
|
|
1145
1079
|
* @return The total allocation size in bytes required for the tensor in the
|
|
1146
1080
|
* CANN buffer.
|
|
1147
1081
|
*/
|
|
1148
|
-
|
|
1082
|
+
static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
|
1149
1083
|
ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
|
|
1150
1084
|
size_t size = ggml_nbytes(tensor);
|
|
1151
1085
|
int64_t ne0 = tensor->ne[0];
|
|
@@ -1196,7 +1130,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
|
|
|
1196
1130
|
* @return A pointer to the buffer type interface for the specified device, or
|
|
1197
1131
|
* nullptr if the device index is out of range.
|
|
1198
1132
|
*/
|
|
1199
|
-
|
|
1133
|
+
ggml_backend_buffer_type_t
|
|
1200
1134
|
ggml_backend_cann_buffer_type(int32_t device) {
|
|
1201
1135
|
static std::mutex mutex;
|
|
1202
1136
|
std::lock_guard<std::mutex> lock(mutex);
|
|
@@ -1225,6 +1159,117 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
|
|
1225
1159
|
return &ggml_backend_cann_buffer_types[device];
|
|
1226
1160
|
}
|
|
1227
1161
|
|
|
1162
|
+
/**
|
|
1163
|
+
* @brief Retrieves the name associated with a CANN host buffer type.
|
|
1164
|
+
*
|
|
1165
|
+
* This function returns the descriptive name associated with the specified
|
|
1166
|
+
* CANN host buffer type context.
|
|
1167
|
+
*
|
|
1168
|
+
* @param buft Pointer to the host buffer type context.
|
|
1169
|
+
* @return Const pointer to the C-style string containing the name.
|
|
1170
|
+
*/
|
|
1171
|
+
static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
1172
|
+
return "CANN_Host";
|
|
1173
|
+
|
|
1174
|
+
GGML_UNUSED(buft);
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
/**
|
|
1178
|
+
* @brief Retrieves the name associated with a CANN host buffer.
|
|
1179
|
+
*
|
|
1180
|
+
* This function returns the descriptive name associated with the specified
|
|
1181
|
+
* CANN host buffer context.
|
|
1182
|
+
*
|
|
1183
|
+
* @param buft Pointer to the host buffer context.
|
|
1184
|
+
* @return Const pointer to the C-style string containing the name.
|
|
1185
|
+
*/
|
|
1186
|
+
static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
|
|
1187
|
+
return "CANN_Host";
|
|
1188
|
+
|
|
1189
|
+
GGML_UNUSED(buffer);
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
/**
|
|
1193
|
+
* @brief Free resources associated with a CANN host buffer.
|
|
1194
|
+
*
|
|
1195
|
+
* This function frees the resources associated with a CANN host buffer, including
|
|
1196
|
+
* its context.
|
|
1197
|
+
*
|
|
1198
|
+
* @param buffer The CANN host buffer to free.
|
|
1199
|
+
*/
|
|
1200
|
+
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
|
1201
|
+
ACL_CHECK(aclrtFreeHost(buffer->context));
|
|
1202
|
+
}
|
|
1203
|
+
|
|
1204
|
+
/**
|
|
1205
|
+
* @brief Allocates a new CANN host buffer of the specified size.
|
|
1206
|
+
*
|
|
1207
|
+
* This function allocates a new CANN host buffer with the given size.
|
|
1208
|
+
* @param size Size in bytes of the host buffer to allocate.
|
|
1209
|
+
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
|
|
1210
|
+
*/
|
|
1211
|
+
static void * ggml_cann_host_malloc(size_t size) {
|
|
1212
|
+
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
|
|
1213
|
+
return nullptr;
|
|
1214
|
+
}
|
|
1215
|
+
|
|
1216
|
+
void * hostPtr = nullptr;
|
|
1217
|
+
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
|
1218
|
+
if (err != ACL_SUCCESS) {
|
|
1219
|
+
|
|
1220
|
+
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
|
1221
|
+
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
|
1222
|
+
return nullptr;
|
|
1223
|
+
}
|
|
1224
|
+
return hostPtr;
|
|
1225
|
+
}
|
|
1226
|
+
|
|
1227
|
+
/**
|
|
1228
|
+
* @brief Allocates a new CANN host buffer of the specified type and size.
|
|
1229
|
+
*
|
|
1230
|
+
* @param buft Pointer to the host buffer type context.
|
|
1231
|
+
* @param size Size in bytes of the host buffer to allocate.
|
|
1232
|
+
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
|
1233
|
+
*/
|
|
1234
|
+
static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
1235
|
+
void * hostPtr = ggml_cann_host_malloc(size);
|
|
1236
|
+
|
|
1237
|
+
if (hostPtr == nullptr) {
|
|
1238
|
+
// fallback to cpu buffer
|
|
1239
|
+
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
|
1240
|
+
}
|
|
1241
|
+
|
|
1242
|
+
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
|
1243
|
+
buffer->buft = buft;
|
|
1244
|
+
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
|
|
1245
|
+
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
|
1246
|
+
|
|
1247
|
+
return buffer;
|
|
1248
|
+
}
|
|
1249
|
+
|
|
1250
|
+
/**
|
|
1251
|
+
* @brief Interface for managing CANN host buffer types in the GGML backend.
|
|
1252
|
+
*
|
|
1253
|
+
* Provides function pointers for allocating, querying properties, and managing
|
|
1254
|
+
* memory for CANN buffer types in the GGML backend.
|
|
1255
|
+
*/
|
|
1256
|
+
ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
|
1257
|
+
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
|
1258
|
+
/* .iface = */ {
|
|
1259
|
+
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
|
1260
|
+
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
|
|
1261
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
1262
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
1263
|
+
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
1264
|
+
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
1265
|
+
},
|
|
1266
|
+
/* .device = */ nullptr,
|
|
1267
|
+
/* .context = */ nullptr,
|
|
1268
|
+
};
|
|
1269
|
+
|
|
1270
|
+
return &ggml_backend_cann_buffer_type_host;
|
|
1271
|
+
}
|
|
1272
|
+
|
|
1228
1273
|
/**
|
|
1229
1274
|
* @brief Computes the forward operation for a given tensor using CANN
|
|
1230
1275
|
* operations.
|
|
@@ -1388,7 +1433,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1388
1433
|
* @param backend Pointer to the CANN backend structure.
|
|
1389
1434
|
* @return A pointer to a constant string representing the backend name.
|
|
1390
1435
|
*/
|
|
1391
|
-
|
|
1436
|
+
static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
|
1392
1437
|
ggml_backend_cann_context* cann_ctx =
|
|
1393
1438
|
(ggml_backend_cann_context*)backend->context;
|
|
1394
1439
|
|
|
@@ -1403,7 +1448,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
|
|
1403
1448
|
*
|
|
1404
1449
|
* @param backend Pointer to the CANN backend structure to be freed.
|
|
1405
1450
|
*/
|
|
1406
|
-
|
|
1451
|
+
static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
1407
1452
|
ggml_backend_cann_context* cann_ctx =
|
|
1408
1453
|
(ggml_backend_cann_context*)backend->context;
|
|
1409
1454
|
ACL_CHECK(aclrtSynchronizeDevice());
|
|
@@ -1428,7 +1473,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
|
1428
1473
|
* @param backend Pointer to the CANN backend structure.
|
|
1429
1474
|
* @return Pointer to the buffer type structure for the CANN backend.
|
|
1430
1475
|
*/
|
|
1431
|
-
|
|
1476
|
+
static ggml_backend_buffer_type_t
|
|
1432
1477
|
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
|
1433
1478
|
ggml_backend_cann_context* cann_ctx =
|
|
1434
1479
|
(ggml_backend_cann_context*)backend->context;
|
|
@@ -1449,43 +1494,42 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
|
|
1449
1494
|
* @param offset Offset in bytes within the host data.
|
|
1450
1495
|
* @param size Size of the data to copy in bytes.
|
|
1451
1496
|
*/
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
ggml_backend_cann_context*
|
|
1458
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1497
|
+
static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
                                               ggml_tensor *tensor,
                                               const void *data,
                                               size_t offset,
                                               size_t size) {
    ggml_backend_cann_context *cann_ctx =
        (ggml_backend_cann_context *)backend->context;

    if (!need_transform(tensor->type)) {
        // No layout conversion required: queue the host-to-device copy
        // directly on the context's stream.
        ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
                                   size, ACL_MEMCPY_HOST_TO_DEVICE,
                                   cann_ctx->stream()));
    } else {
        // Convert the host data into a temporary staging buffer first.
        void *transform_buffer = malloc(size);
        ggml_backend_cann_transform(tensor, data, transform_buffer);

#ifndef NDEBUG
        // Debug-only round trip: transforming back must reproduce the input.
        // memcmp() returns 0 when the buffers are equal, so the assertion has
        // to compare against 0 (the bare memcmp() result asserted the
        // opposite).
        void *check_buffer = malloc(size);
        ggml_backend_cann_transform_back(tensor, transform_buffer,
                                         check_buffer);
        GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
        free(check_buffer);
#endif
        ACL_CHECK(aclrtMemcpyAsync(
            (char *)tensor->data + offset, size, transform_buffer, size,
            ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
        // The staging buffer is freed right below, so wait for the queued
        // copy to finish before releasing it.
        ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
        free(transform_buffer);
    }
}
|
|
1483
1527
|
|
|
1484
|
-
|
|
1485
|
-
ggml_backend_t backend, const ggml_tensor*
|
|
1528
|
+
static void ggml_backend_cann_get_tensor_async(
|
|
1529
|
+
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
|
|
1486
1530
|
size_t offset, size_t size) {
|
|
1487
|
-
ggml_backend_cann_context*
|
|
1488
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1531
|
+
ggml_backend_cann_context *cann_ctx =
|
|
1532
|
+
(ggml_backend_cann_context *)backend->context;
|
|
1489
1533
|
ggml_backend_buffer_t buf =
|
|
1490
1534
|
tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
1491
1535
|
|
|
@@ -1493,17 +1537,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
|
|
|
1493
1537
|
"unsupported buffer type");
|
|
1494
1538
|
|
|
1495
1539
|
if (!need_transform(tensor->type)) {
|
|
1496
|
-
ACL_CHECK(aclrtMemcpyAsync((char*)data + offset,
|
|
1540
|
+
ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
|
|
1497
1541
|
size, ACL_MEMCPY_DEVICE_TO_HOST,
|
|
1498
1542
|
cann_ctx->stream()));
|
|
1499
1543
|
} else {
|
|
1500
|
-
void*
|
|
1501
|
-
ACL_CHECK(aclrtMemcpyAsync(
|
|
1502
|
-
|
|
1503
|
-
|
|
1544
|
+
void *transform_buffer = malloc(size);
|
|
1545
|
+
ACL_CHECK(aclrtMemcpyAsync(
|
|
1546
|
+
transform_buffer, size, (char *)tensor->data + offset, size,
|
|
1547
|
+
ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
|
|
1504
1548
|
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
|
1505
|
-
ggml_backend_cann_transform_back(tensor, transform_buffer,
|
|
1506
|
-
(char*)data + offset);
|
|
1549
|
+
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
|
|
1507
1550
|
free(transform_buffer);
|
|
1508
1551
|
}
|
|
1509
1552
|
}
|
|
@@ -1521,7 +1564,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
|
|
|
1521
1564
|
* @param dst Pointer to the destination tensor to copy data to.
|
|
1522
1565
|
* @return true if the copy operation succeeds, false otherwise.
|
|
1523
1566
|
*/
|
|
1524
|
-
|
|
1567
|
+
static bool ggml_backend_cann_cpy_tensor_async(
|
|
1525
1568
|
ggml_backend_t backend_src, ggml_backend_t backend_dst,
|
|
1526
1569
|
const ggml_tensor* src, ggml_tensor* dst) {
|
|
1527
1570
|
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
|
|
@@ -1589,7 +1632,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
|
|
|
1589
1632
|
*
|
|
1590
1633
|
* @param backend Pointer to the CANN backend structure to synchronize.
|
|
1591
1634
|
*/
|
|
1592
|
-
|
|
1635
|
+
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|
1593
1636
|
ggml_backend_cann_context* cann_ctx =
|
|
1594
1637
|
(ggml_backend_cann_context*)backend->context;
|
|
1595
1638
|
|
|
@@ -1610,7 +1653,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|
|
1610
1653
|
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
|
|
1611
1654
|
* completes successfully, otherwise an appropriate error status.
|
|
1612
1655
|
*/
|
|
1613
|
-
|
|
1656
|
+
static enum ggml_status ggml_backend_cann_graph_compute(
|
|
1614
1657
|
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
|
1615
1658
|
ggml_backend_cann_context* cann_ctx =
|
|
1616
1659
|
(ggml_backend_cann_context*)backend->context;
|
|
@@ -1627,7 +1670,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
|
|
1627
1670
|
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
|
1628
1671
|
|
|
1629
1672
|
if (!ok) {
|
|
1630
|
-
|
|
1673
|
+
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
|
|
1631
1674
|
node->name, ggml_op_name(node->op));
|
|
1632
1675
|
}
|
|
1633
1676
|
GGML_ASSERT(ok);
|
|
@@ -1648,7 +1691,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
|
|
1648
1691
|
* @return bool Returns true if the operation is supported by the backend,
|
|
1649
1692
|
* otherwise false.
|
|
1650
1693
|
*/
|
|
1651
|
-
|
|
1694
|
+
static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
|
|
1652
1695
|
const ggml_tensor* op) {
|
|
1653
1696
|
switch (op->op) {
|
|
1654
1697
|
case GGML_OP_UNARY:
|
|
@@ -1666,10 +1709,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
|
|
|
1666
1709
|
}
|
|
1667
1710
|
case GGML_OP_MUL_MAT: {
|
|
1668
1711
|
switch (op->src[0]->type) {
|
|
1669
|
-
// case GGML_TYPE_Q4_0:
|
|
1670
1712
|
case GGML_TYPE_F16:
|
|
1671
1713
|
case GGML_TYPE_F32:
|
|
1672
1714
|
case GGML_TYPE_Q8_0:
|
|
1715
|
+
// TODO: fix me
|
|
1716
|
+
// Current groupsize should not be greater than k-1 in
|
|
1717
|
+
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
|
|
1718
|
+
case GGML_TYPE_Q4_0:
|
|
1673
1719
|
return true;
|
|
1674
1720
|
default:
|
|
1675
1721
|
return false;
|
|
@@ -1694,6 +1740,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
|
|
|
1694
1740
|
case GGML_TYPE_F32:
|
|
1695
1741
|
case GGML_TYPE_F16:
|
|
1696
1742
|
case GGML_TYPE_Q8_0:
|
|
1743
|
+
case GGML_TYPE_Q4_0:
|
|
1697
1744
|
return true;
|
|
1698
1745
|
default:
|
|
1699
1746
|
return false;
|
|
@@ -1766,7 +1813,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
|
|
|
1766
1813
|
* @return bool Returns true if the CANN backend supports the buffer type,
|
|
1767
1814
|
* otherwise false.
|
|
1768
1815
|
*/
|
|
1769
|
-
|
|
1816
|
+
static bool ggml_backend_cann_supports_buft(
|
|
1770
1817
|
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
1771
1818
|
if (ggml_backend_buft_is_cann(buft)) {
|
|
1772
1819
|
ggml_backend_cann_context * cann_ctx =
|
|
@@ -1792,7 +1839,7 @@ GGML_CALL static bool ggml_backend_cann_supports_buft(
|
|
|
1792
1839
|
* @return bool Returns true if the operation should be offloaded, otherwise
|
|
1793
1840
|
* false.
|
|
1794
1841
|
*/
|
|
1795
|
-
|
|
1842
|
+
static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
|
|
1796
1843
|
const ggml_tensor* op) {
|
|
1797
1844
|
const int min_batch_size = 32;
|
|
1798
1845
|
GGML_UNUSED(backend);
|
|
@@ -1912,11 +1959,8 @@ static ggml_backend_i ggml_backend_cann_interface = {
|
|
|
1912
1959
|
/* .supports_op = */ ggml_backend_cann_supports_op,
|
|
1913
1960
|
/* .supports_buft = */ ggml_backend_cann_supports_buft,
|
|
1914
1961
|
/* .offload_op = */ ggml_backend_cann_offload_op,
|
|
1915
|
-
/* .event_new = */ ggml_backend_cann_event_new,
|
|
1916
|
-
/* .event_free = */ ggml_backend_cann_event_free,
|
|
1917
1962
|
/* .event_record = */ ggml_backend_cann_event_record,
|
|
1918
1963
|
/* .event_wait = */ ggml_backend_cann_event_wait,
|
|
1919
|
-
/* .event_synchronize = */ ggml_backend_cann_event_synchronize,
|
|
1920
1964
|
};
|
|
1921
1965
|
|
|
1922
1966
|
/**
|
|
@@ -1933,91 +1977,46 @@ static ggml_guid_t ggml_backend_cann_guid() {
|
|
|
1933
1977
|
return &guid;
|
|
1934
1978
|
}
|
|
1935
1979
|
|
|
1936
|
-
|
|
1980
|
+
/**
 * @brief Creates a CANN backend instance for the given device.
 *
 * Calls `aclInit(nullptr)`, validates the device index against
 * `ggml_backend_cann_get_device_count()`, allocates a
 * `ggml_backend_cann_context` for the device, selects that device and wraps
 * the context in a newly allocated `ggml_backend`.
 *
 * @param device Index of the device to initialize.
 * @return The new backend, or nullptr (after logging an error) when the
 *         device index is out of range or the context pointer is null.
 */
ggml_backend_t ggml_backend_cann_init(int32_t device) {
    aclInit(nullptr);

    // The device count is only queried when the index is non-negative.
    const bool in_range =
        device >= 0 && device < ggml_backend_cann_get_device_count();
    if (!in_range) {
        GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
        return nullptr;
    }

    ggml_backend_cann_context* cann_ctx = new ggml_backend_cann_context(device);
    if (!cann_ctx) {
        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
        return nullptr;
    }

    ggml_cann_set_device(cann_ctx->device);

    return new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                            /* .interface = */ ggml_backend_cann_interface,
                            /* .device    = */ nullptr,
                            /* .context   = */ cann_ctx};
}
|
|
1956
2001
|
|
|
1957
|
-
|
|
2002
|
+
/**
 * @brief Tells whether a backend handle refers to a CANN backend.
 *
 * @param backend Backend handle to test; may be NULL.
 * @return true if `backend` is non-NULL and its guid matches
 *         `ggml_backend_cann_guid()`, false otherwise.
 */
bool ggml_backend_is_cann(ggml_backend_t backend) {
    if (backend == NULL) {
        return false;
    }
    return ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
}
|
|
1961
2006
|
|
|
1962
|
-
|
|
2007
|
+
int32_t ggml_backend_cann_get_device_count() {
|
|
1963
2008
|
return ggml_cann_info().device_count;
|
|
1964
2009
|
}
|
|
1965
2010
|
|
|
1966
|
-
|
|
2011
|
+
void ggml_backend_cann_get_device_description(
|
|
1967
2012
|
int32_t device, char* description, size_t description_size) {
|
|
1968
2013
|
ggml_cann_set_device(device);
|
|
1969
2014
|
const char* soc_name = aclrtGetSocName();
|
|
1970
2015
|
snprintf(description, description_size, "%s", soc_name);
|
|
1971
2016
|
}
|
|
1972
2017
|
|
|
1973
|
-
|
|
1974
|
-
|
|
2018
|
+
void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
|
|
2019
|
+
size_t* total) {
|
|
1975
2020
|
ggml_cann_set_device(device);
|
|
1976
2021
|
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
|
|
1977
2022
|
}
|
|
1978
|
-
|
|
1979
|
-
// backend registry
|
|
1980
|
-
/**
|
|
1981
|
-
* @brief Initializes a CANN backend based on the provided parameters.
|
|
1982
|
-
*
|
|
1983
|
-
* This function initializes a CANN backend using the device index and then
|
|
1984
|
-
* initializes the backend using `ggml_backend_cann_init`.
|
|
1985
|
-
*
|
|
1986
|
-
* @param params Parameters for initialization (unused in this implementation).
|
|
1987
|
-
* @param user_data User data containing the device index to initialize the
|
|
1988
|
-
* backend.
|
|
1989
|
-
* @return ggml_backend_t The initialized CANN backend.
|
|
1990
|
-
*/
|
|
1991
|
-
GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
|
|
1992
|
-
void* user_data) {
|
|
1993
|
-
ggml_backend_t cann_backend =
|
|
1994
|
-
ggml_backend_cann_init((int)(intptr_t)user_data);
|
|
1995
|
-
return cann_backend;
|
|
1996
|
-
|
|
1997
|
-
GGML_UNUSED(params);
|
|
1998
|
-
}
|
|
1999
|
-
|
|
2000
|
-
extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
|
|
2001
|
-
|
|
2002
|
-
/**
|
|
2003
|
-
* @brief Registers CANN (Ascend) devices as backend options.
|
|
2004
|
-
*
|
|
2005
|
-
* This function initializes ACL, retrieves the number of available CANN
|
|
2006
|
-
* devices, and registers each device as a backend option using
|
|
2007
|
-
* `ggml_backend_register`. Each device is given a unique name based on
|
|
2008
|
-
* `GGML_CANN_NAME` followed by its index.
|
|
2009
|
-
*
|
|
2010
|
-
* @return int The number of CANN devices registered.
|
|
2011
|
-
*/
|
|
2012
|
-
GGML_CALL int ggml_backend_cann_reg_devices() {
|
|
2013
|
-
uint32_t device_count = ggml_backend_cann_get_device_count();
|
|
2014
|
-
// initialization
|
|
2015
|
-
for (uint32_t i = 0; i < device_count; i++) {
|
|
2016
|
-
char name[128];
|
|
2017
|
-
snprintf(name, sizeof(name), "CANN%d", i);
|
|
2018
|
-
ggml_backend_register(name, ggml_backend_reg_cann_init,
|
|
2019
|
-
ggml_backend_cann_buffer_type(i),
|
|
2020
|
-
(void*)(intptr_t)i);
|
|
2021
|
-
}
|
|
2022
|
-
return device_count;
|
|
2023
|
-
}
|