@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
@@ -30,6 +30,7 @@
30
30
  #include <cstring>
31
31
  #include <mutex>
32
32
 
33
+ #include "ggml-impl.h"
33
34
  #include "ggml-backend-impl.h"
34
35
  #include "ggml-cann/aclnn_ops.h"
35
36
  #include "ggml-cann/common.h"
@@ -38,69 +39,6 @@
38
39
 
39
40
  #include "ggml-common.h"
40
41
 
41
- /**
42
- * @brief Default logging callback for GGML.
43
- *
44
- * This function is the default logging callback that logs messages to stderr.
45
- *
46
- * @param level The log level.
47
- * @param msg The log message.
48
- * @param user_data User data passed to the callback.
49
- */
50
- static void ggml_cann_default_log_callback(enum ggml_log_level level,
51
- const char* msg, void* user_data) {
52
- GGML_UNUSED(level);
53
- GGML_UNUSED(user_data);
54
- fprintf(stderr, "%s", msg);
55
- }
56
-
57
- ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
58
- void* ggml_cann_log_user_data = NULL;
59
-
60
- GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
61
- void* user_data) {
62
- ggml_cann_log_callback = log_callback;
63
- ggml_cann_log_user_data = user_data;
64
- }
65
-
66
- #define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
67
- #define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
68
- #define GGML_CANN_LOG_ERROR(...) \
69
- ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
70
-
71
- GGML_ATTRIBUTE_FORMAT(2, 3)
72
-
73
- /**
74
- * @brief Log a message using the current logging callback.
75
- *
76
- * This function formats a log message and passes it to the current logging
77
- * callback.
78
- *
79
- * @param level The log level.
80
- * @param format The format string for the log message.
81
- * @param ... The arguments for the format string.
82
- */
83
- static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
84
- if (ggml_cann_log_callback != NULL) {
85
- va_list args;
86
- va_start(args, format);
87
- char buffer[128];
88
- int len = vsnprintf(buffer, 128, format, args);
89
- if (len < 128) {
90
- ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
91
- } else {
92
- // vsnprintf adds a null terminator
93
- std::vector<char> buffer2(len + 1);
94
- va_end(args);
95
- va_start(args, format);
96
- vsnprintf(&buffer2[0], buffer2.size(), format, args);
97
- ggml_cann_log_callback(level, buffer2.data(),
98
- ggml_cann_log_user_data);
99
- }
100
- va_end(args);
101
- }
102
- }
103
-
104
42
  /**
105
43
  * @brief Handles CANN errors by printing an error message and aborting.
106
44
  *
@@ -115,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
115
53
  int32_t id = -1;
116
54
  aclrtGetDevice(&id);
117
55
 
118
- GGML_CANN_LOG_ERROR("CANN error: %s\n", msg);
119
- GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
56
+ GGML_LOG_ERROR("CANN error: %s\n", msg);
57
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
120
58
  file, line);
121
- GGML_CANN_LOG_ERROR(" %s\n", stmt);
59
+ GGML_LOG_ERROR(" %s\n", stmt);
122
60
  // abort with GGML_ASSERT to get a stack trace
123
61
  GGML_ABORT("CANN error");
124
62
  }
@@ -164,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() {
164
102
  aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
165
103
 
166
104
  if (err != ACL_SUCCESS) {
167
- GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n",
105
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
168
106
  __func__, aclGetRecentErrMsg());
169
107
  return info;
170
108
  }
@@ -314,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
314
252
  *actual_size = look_ahead_size;
315
253
  pool_size += look_ahead_size;
316
254
  #ifdef DEBUG_CANN_MALLOC
317
- GGML_CANN_LOG_INFO(
255
+ GGML_LOG_INFO(
318
256
  "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
319
257
  "requested %u MB\n",
320
258
  __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
@@ -469,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
469
407
  // add to the pool
470
408
  pool_size += reserve_size;
471
409
 
472
- // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB (
410
+ // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
473
411
  // reserved %llu MB)\n",
474
412
  // device, (unsigned long long) (pool_size/1024/1024),
475
413
  // (unsigned long long) (reserve_size/1024/1024));
@@ -482,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
482
420
  pool_used += size;
483
421
 
484
422
  #ifdef DEBUG_CANN_MALLOC
485
- GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
423
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
486
424
  (unsigned long long)size, (unsigned long long)ptr);
487
425
  #endif
488
426
  return ptr;
@@ -496,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
496
434
  */
497
435
  void free(void* ptr, size_t size) override {
498
436
  #ifdef DEBUG_CANN_MALLOC
499
- GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
437
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
500
438
  (unsigned long long)size, (unsigned long long)ptr);
501
439
  #endif
502
440
 
@@ -559,7 +497,7 @@ struct ggml_backend_cann_buffer_context {
559
497
  * @return A pointer to a C-string containing the name of the buffer.
560
498
  */
561
499
 
562
- GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
500
+ static const char* ggml_backend_cann_buffer_get_name(
563
501
  ggml_backend_buffer_t buffer) {
564
502
  return "CANN";
565
503
 
@@ -575,7 +513,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
575
513
  * @param buffer The buffer to check.
576
514
  * @return true if the buffer is a CANN buffer, false otherwise.
577
515
  */
578
- GGML_CALL static bool ggml_backend_buffer_is_cann(
516
+ static bool ggml_backend_buffer_is_cann(
579
517
  ggml_backend_buffer_t buffer) {
580
518
  return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
581
519
  }
@@ -588,7 +526,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann(
588
526
  *
589
527
  * @param buffer The CANN buffer to free.
590
528
  */
591
- GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
529
+ static void ggml_backend_cann_buffer_free_buffer(
592
530
  ggml_backend_buffer_t buffer) {
593
531
  ggml_backend_cann_buffer_context* ctx =
594
532
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -604,7 +542,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
604
542
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
605
543
  * @return A pointer to the base of the device memory allocated for the buffer.
606
544
  */
607
- GGML_CALL static void* ggml_backend_cann_buffer_get_base(
545
+ static void* ggml_backend_cann_buffer_get_base(
608
546
  ggml_backend_buffer_t buffer) {
609
547
  ggml_backend_cann_buffer_context* ctx =
610
548
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -624,10 +562,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
624
562
  * @param dst Pointer to the destination buffer where transformed data will be
625
563
  * stored.
626
564
  */
627
- GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
628
- const void* src,
629
- void* dst) {
630
- GGML_ASSERT(tensor->op == GGML_OP_NONE);
565
+ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
566
+ const void* src,
567
+ void* dst) {
631
568
 
632
569
  int64_t n_elems = ggml_nelements(tensor);
633
570
  int64_t groups = n_elems / QK4_0;
@@ -677,9 +614,8 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
677
614
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
678
615
  * will be stored.
679
616
  */
680
- GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
617
+ static void ggml_backend_cann_transform_back_q4_0(
681
618
  const ggml_tensor* tensor, void* src, void* dst) {
682
- GGML_ASSERT(tensor->op == GGML_OP_NONE);
683
619
 
684
620
  int64_t n_elems = ggml_nelements(tensor);
685
621
  int64_t groups = n_elems / QK4_0;
@@ -727,9 +663,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
727
663
  * @param dst Pointer to the destination buffer where transformed data will be
728
664
  * stored.
729
665
  */
730
- GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
731
- const void* src,
732
- void* dst) {
666
+ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
667
+ const void* src,
668
+ void* dst) {
733
669
  int64_t n_elems = ggml_nelements(tensor);
734
670
  int64_t groups = n_elems / QK8_0;
735
671
  size_t quant_bytes = n_elems * sizeof(uint8_t);
@@ -761,7 +697,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
761
697
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
762
698
  * will be stored.
763
699
  */
764
- GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
700
+ static void ggml_backend_cann_transform_back_q8_0(
765
701
  const ggml_tensor* tensor, const void* src, void* dst) {
766
702
  int64_t n_elems = ggml_nelements(tensor);
767
703
  int64_t groups = n_elems / QK8_0;
@@ -793,8 +729,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
793
729
  * @param dst Pointer to the destination buffer where transformed data will be
794
730
  * stored.
795
731
  */
796
- GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
797
- const void* src, void* dst) {
732
+ static void ggml_backend_cann_transform(ggml_tensor* tensor,
733
+ const void* src, void* dst) {
798
734
  switch (tensor->type) {
799
735
  case GGML_TYPE_Q4_0:
800
736
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -819,7 +755,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
819
755
  * @param dst Pointer to the destination buffer where transformed tensor data
820
756
  * will be stored.
821
757
  */
822
- GGML_CALL static void ggml_backend_cann_transform_back(
758
+ static void ggml_backend_cann_transform_back(
823
759
  const ggml_tensor* tensor, void* src, void* dst) {
824
760
  switch (tensor->type) {
825
761
  case GGML_TYPE_Q4_0:
@@ -842,7 +778,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(
842
778
  * @param type The tensor type to check.
843
779
  * @return true if transformation is needed, false otherwise.
844
780
  */
845
- GGML_CALL static bool need_transform(ggml_type type) {
781
+ static bool need_transform(ggml_type type) {
846
782
  switch (type) {
847
783
  case GGML_TYPE_Q4_0:
848
784
  case GGML_TYPE_Q8_0:
@@ -861,7 +797,7 @@ GGML_CALL static bool need_transform(ggml_type type) {
861
797
  * @param buffer The CANN buffer from which to initialize the tensor.
862
798
  * @param tensor Pointer to the tensor to be initialized.
863
799
  */
864
- GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
800
+ static void ggml_backend_cann_buffer_init_tensor(
865
801
  ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
866
802
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
867
803
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@@ -897,12 +833,11 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
897
833
  * @param offset Offset in the source data from where to start copying.
898
834
  * @param size Size of the data to be copied, in bytes.
899
835
  */
900
- GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
901
- ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data,
836
+ static void ggml_backend_cann_buffer_set_tensor(
837
+ ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
902
838
  size_t offset, size_t size) {
903
- // GGML_ASSERT(size == ggml_nbytes(tensor));
904
- ggml_backend_cann_buffer_context* ctx =
905
- (ggml_backend_cann_buffer_context*)buffer->context;
839
+ ggml_backend_cann_buffer_context *ctx =
840
+ (ggml_backend_cann_buffer_context *)buffer->context;
906
841
 
907
842
  ggml_cann_set_device(ctx->device);
908
843
  // TODO: refer to cann(#6017), it use thread's default stream.
@@ -910,22 +845,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
910
845
  // Why aclrtSynchronizeDevice?
911
846
 
912
847
  if (!need_transform(tensor->type)) {
913
- ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset,
914
- size, ACL_MEMCPY_HOST_TO_DEVICE));
848
+ ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
849
+ ACL_MEMCPY_HOST_TO_DEVICE));
915
850
  } else {
916
- void* transform_buffer = malloc(size);
917
- ggml_backend_cann_transform(tensor, (const char*)data + offset,
918
- transform_buffer);
851
+ void *transform_buffer = malloc(size);
852
+ ggml_backend_cann_transform(tensor, data, transform_buffer);
919
853
 
920
854
  #ifndef NDEBUG
921
- void* check_buffer = malloc(size);
855
+ void *check_buffer = malloc(size);
922
856
  ggml_backend_cann_transform_back(tensor, transform_buffer,
923
857
  check_buffer);
924
- GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) ==
925
- 0);
858
+ GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
926
859
  free(check_buffer);
927
860
  #endif
928
- ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size,
861
+ ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
862
+ transform_buffer, size,
929
863
  ACL_MEMCPY_HOST_TO_DEVICE));
930
864
  free(transform_buffer);
931
865
  }
@@ -944,24 +878,23 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
944
878
  * @param offset Offset in the destination buffer where to start copying.
945
879
  * @param size Size of the data to be copied, in bytes.
946
880
  */
947
- GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
881
+ static void ggml_backend_cann_buffer_get_tensor(
948
882
  ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
949
883
  size_t offset, size_t size) {
950
- GGML_ASSERT(size == ggml_nbytes(tensor));
951
884
  ggml_backend_cann_buffer_context* ctx =
952
885
  (ggml_backend_cann_buffer_context*)buffer->context;
953
886
 
954
887
  ggml_cann_set_device(ctx->device);
955
888
 
956
889
  if (!need_transform(tensor->type)) {
957
- ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size,
890
+ ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
958
891
  ACL_MEMCPY_DEVICE_TO_HOST));
959
892
  } else {
960
893
  void* transform_buffer = malloc(size);
961
- ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size,
894
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size,
895
+ (char*)tensor->data + offset, size,
962
896
  ACL_MEMCPY_DEVICE_TO_HOST));
963
- ggml_backend_cann_transform_back(tensor, transform_buffer,
964
- (char*)data + offset);
897
+ ggml_backend_cann_transform_back(tensor, transform_buffer, data);
965
898
  free(transform_buffer);
966
899
  }
967
900
  }
@@ -979,7 +912,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
979
912
  * @param dst Pointer to the destination tensor where the data will be copied.
980
913
  * @return true if the copy operation succeeded, false otherwise.
981
914
  */
982
- GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
915
+ static bool ggml_backend_cann_buffer_cpy_tensor(
983
916
  ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
984
917
  if (ggml_backend_buffer_is_cann(src->buffer)) {
985
918
  ggml_backend_cann_buffer_context* src_ctx =
@@ -1021,7 +954,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
1021
954
  * @param buffer The CANN buffer to be cleared.
1022
955
  * @param value The value to which each byte in the buffer will be set.
1023
956
  */
1024
- GGML_CALL static void ggml_backend_cann_buffer_clear(
957
+ static void ggml_backend_cann_buffer_clear(
1025
958
  ggml_backend_buffer_t buffer, uint8_t value) {
1026
959
  ggml_backend_cann_buffer_context* ctx =
1027
960
  (ggml_backend_cann_buffer_context*)buffer->context;
@@ -1041,6 +974,7 @@ static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
1041
974
  /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
1042
975
  /* .get_base = */ ggml_backend_cann_buffer_get_base,
1043
976
  /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
977
+ /* .memset_tensor = */ NULL,
1044
978
  /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
1045
979
  /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
1046
980
  /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
@@ -1068,7 +1002,7 @@ struct ggml_backend_cann_buffer_type_context {
1068
1002
  * @param buft Pointer to the buffer type context.
1069
1003
  * @return Const pointer to the C-style string containing the name.
1070
1004
  */
1071
- GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
1005
+ static const char* ggml_backend_cann_buffer_type_name(
1072
1006
  ggml_backend_buffer_type_t buft) {
1073
1007
  return "CANN";
1074
1008
 
@@ -1085,7 +1019,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
1085
1019
  * @param size Size in bytes of the buffer to allocate.
1086
1020
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1087
1021
  */
1088
- GGML_CALL static ggml_backend_buffer_t
1022
+ static ggml_backend_buffer_t
1089
1023
  ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1090
1024
  size_t size) {
1091
1025
  ggml_backend_cann_buffer_type_context* buft_ctx =
@@ -1098,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1098
1032
  void* dev_ptr;
1099
1033
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1100
1034
  if (err != ACL_SUCCESS) {
1101
- GGML_CANN_LOG_ERROR(
1035
+ GGML_LOG_ERROR(
1102
1036
  "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1103
1037
  __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1104
1038
  aclGetRecentErrMsg());
@@ -1124,7 +1058,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1124
1058
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1125
1059
  * buffers).
1126
1060
  */
1127
- GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
1061
+ static size_t ggml_backend_cann_buffer_type_get_alignment(
1128
1062
  ggml_backend_buffer_type_t buft) {
1129
1063
  return 128;
1130
1064
 
@@ -1145,7 +1079,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
1145
1079
  * @return The total allocation size in bytes required for the tensor in the
1146
1080
  * CANN buffer.
1147
1081
  */
1148
- GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1082
+ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1149
1083
  ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1150
1084
  size_t size = ggml_nbytes(tensor);
1151
1085
  int64_t ne0 = tensor->ne[0];
@@ -1196,7 +1130,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
1196
1130
  * @return A pointer to the buffer type interface for the specified device, or
1197
1131
  * nullptr if the device index is out of range.
1198
1132
  */
1199
- GGML_CALL ggml_backend_buffer_type_t
1133
+ ggml_backend_buffer_type_t
1200
1134
  ggml_backend_cann_buffer_type(int32_t device) {
1201
1135
  static std::mutex mutex;
1202
1136
  std::lock_guard<std::mutex> lock(mutex);
@@ -1225,6 +1159,117 @@ ggml_backend_cann_buffer_type(int32_t device) {
1225
1159
  return &ggml_backend_cann_buffer_types[device];
1226
1160
  }
1227
1161
 
1162
+ /**
1163
+ * @brief Retrieves the name associated with a CANN host buffer type.
1164
+ *
1165
+ * This function returns the descriptive name associated with the specified
1166
+ * CANN host buffer type context.
1167
+ *
1168
+ * @param buft Pointer to the host buffer type context.
1169
+ * @return Const pointer to the C-style string containing the name.
1170
+ */
1171
+ static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1172
+ return "CANN_Host";
1173
+
1174
+ GGML_UNUSED(buft);
1175
+ }
1176
+
1177
+ /**
1178
+ * @brief Retrieves the name associated with a CANN host buffer.
1179
+ *
1180
+ * This function returns the descriptive name associated with the specified
1181
+ * CANN host buffer context.
1182
+ *
1183
+ * @param buffer Pointer to the host buffer context.
1184
+ * @return Const pointer to the C-style string containing the name.
1185
+ */
1186
+ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
1187
+ return "CANN_Host";
1188
+
1189
+ GGML_UNUSED(buffer);
1190
+ }
1191
+
1192
+ /**
1193
+ * @brief Free resources associated with a CANN host buffer.
1194
+ *
1195
+ * This function frees the resources associated with a CANN host buffer, including
1196
+ * its context.
1197
+ *
1198
+ * @param buffer The CANN host buffer to free.
1199
+ */
1200
+ static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
1201
+ ACL_CHECK(aclrtFreeHost(buffer->context));
1202
+ }
1203
+
1204
+ /**
1205
+ * @brief Allocates a new CANN host buffer of the specified size.
1206
+ *
1207
+ * This function allocates a new CANN host buffer with the given size.
1208
+ * @param size Size in bytes of the host buffer to allocate.
1209
+ * @return Pointer to the allocated host buffer, or nullptr if allocation fails or the GGML_CANN_NO_PINNED environment variable is set.
1210
+ */
1211
+ static void * ggml_cann_host_malloc(size_t size) {
1212
+ if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
1213
+ return nullptr;
1214
+ }
1215
+
1216
+ void * hostPtr = nullptr;
1217
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
1218
+ if (err != ACL_SUCCESS) {
1219
+
1220
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1221
+ size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1222
+ return nullptr;
1223
+ }
1224
+ return hostPtr;
1225
+ }
1226
+
1227
+ /**
1228
+ * @brief Allocates a new CANN host buffer of the specified type and size.
1229
+ *
1230
+ * @param buft Pointer to the host buffer type context.
1231
+ * @param size Size in bytes of the host buffer to allocate.
1232
+ * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1233
+ */
1234
+ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1235
+ void * hostPtr = ggml_cann_host_malloc(size);
1236
+
1237
+ if (hostPtr == nullptr) {
1238
+ // fallback to cpu buffer
1239
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1240
+ }
1241
+
1242
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1243
+ buffer->buft = buft;
1244
+ buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
1245
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1246
+
1247
+ return buffer;
1248
+ }
1249
+
1250
+ /**
1251
+ * @brief Interface for managing CANN host buffer types in the GGML backend.
1252
+ *
1253
+ * Provides function pointers for allocating, querying properties, and managing
1254
+ * memory for CANN host buffer types in the GGML backend.
1255
+ */
1256
+ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1257
+ static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1258
+ /* .iface = */ {
1259
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1260
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1261
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1262
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1263
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1264
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1265
+ },
1266
+ /* .device = */ nullptr,
1267
+ /* .context = */ nullptr,
1268
+ };
1269
+
1270
+ return &ggml_backend_cann_buffer_type_host;
1271
+ }
1272
+
1228
1273
  /**
1229
1274
  * @brief Computes the forward operation for a given tensor using CANN
1230
1275
  * operations.
@@ -1388,7 +1433,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1388
1433
  * @param backend Pointer to the CANN backend structure.
1389
1434
  * @return A pointer to a constant string representing the backend name.
1390
1435
  */
1391
- GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1436
+ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1392
1437
  ggml_backend_cann_context* cann_ctx =
1393
1438
  (ggml_backend_cann_context*)backend->context;
1394
1439
 
@@ -1403,7 +1448,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1403
1448
  *
1404
1449
  * @param backend Pointer to the CANN backend structure to be freed.
1405
1450
  */
1406
- GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
1451
+ static void ggml_backend_cann_free(ggml_backend_t backend) {
1407
1452
  ggml_backend_cann_context* cann_ctx =
1408
1453
  (ggml_backend_cann_context*)backend->context;
1409
1454
  ACL_CHECK(aclrtSynchronizeDevice());
@@ -1428,7 +1473,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
1428
1473
  * @param backend Pointer to the CANN backend structure.
1429
1474
  * @return Pointer to the buffer type structure for the CANN backend.
1430
1475
  */
1431
- GGML_CALL static ggml_backend_buffer_type_t
1476
+ static ggml_backend_buffer_type_t
1432
1477
  ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1433
1478
  ggml_backend_cann_context* cann_ctx =
1434
1479
  (ggml_backend_cann_context*)backend->context;
@@ -1449,43 +1494,42 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1449
1494
  * @param offset Offset in bytes within the host data.
1450
1495
  * @param size Size of the data to copy in bytes.
1451
1496
  */
1452
- GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1453
- ggml_tensor* tensor,
1454
- const void* data,
1455
- size_t offset,
1456
- size_t size) {
1457
- ggml_backend_cann_context* cann_ctx =
1458
- (ggml_backend_cann_context*)backend->context;
1497
+ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1498
+ ggml_tensor *tensor,
1499
+ const void *data,
1500
+ size_t offset,
1501
+ size_t size) {
1502
+ ggml_backend_cann_context *cann_ctx =
1503
+ (ggml_backend_cann_context *)backend->context;
1459
1504
 
1460
1505
  if (!need_transform(tensor->type)) {
1461
- ACL_CHECK(aclrtMemcpyAsync(
1462
- tensor->data, size, (const char*)data + offset, size,
1463
- ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
1506
+ ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
1507
+ size, ACL_MEMCPY_HOST_TO_DEVICE,
1508
+ cann_ctx->stream()));
1464
1509
  } else {
1465
- void* transform_buffer = malloc(size);
1466
- ggml_backend_cann_transform(tensor, (const char*)data + offset,
1467
- transform_buffer);
1510
+ void *transform_buffer = malloc(size);
1511
+ ggml_backend_cann_transform(tensor, data, transform_buffer);
1468
1512
 
1469
1513
  #ifndef NDEBUG
1470
- void* check_buffer = malloc(size);
1514
+ void *check_buffer = malloc(size);
1471
1515
  ggml_backend_cann_transform_back(tensor, transform_buffer,
1472
1516
  check_buffer);
1473
- GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size));
1517
+ GGML_ASSERT(memcmp(data, check_buffer, size));
1474
1518
  free(check_buffer);
1475
1519
  #endif
1476
- ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size,
1477
- ACL_MEMCPY_HOST_TO_DEVICE,
1478
- cann_ctx->stream()));
1520
+ ACL_CHECK(aclrtMemcpyAsync(
1521
+ (char *)tensor->data + offset, size, transform_buffer, size,
1522
+ ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
1479
1523
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1480
1524
  free(transform_buffer);
1481
1525
  }
1482
1526
  }
1483
1527
 
1484
- GGML_CALL static void ggml_backend_cann_get_tensor_async(
1485
- ggml_backend_t backend, const ggml_tensor* tensor, void* data,
1528
+ static void ggml_backend_cann_get_tensor_async(
1529
+ ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1486
1530
  size_t offset, size_t size) {
1487
- ggml_backend_cann_context* cann_ctx =
1488
- (ggml_backend_cann_context*)backend->context;
1531
+ ggml_backend_cann_context *cann_ctx =
1532
+ (ggml_backend_cann_context *)backend->context;
1489
1533
  ggml_backend_buffer_t buf =
1490
1534
  tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1491
1535
 
@@ -1493,17 +1537,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
1493
1537
  "unsupported buffer type");
1494
1538
 
1495
1539
  if (!need_transform(tensor->type)) {
1496
- ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data,
1540
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
1497
1541
  size, ACL_MEMCPY_DEVICE_TO_HOST,
1498
1542
  cann_ctx->stream()));
1499
1543
  } else {
1500
- void* transform_buffer = malloc(size);
1501
- ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size,
1502
- ACL_MEMCPY_DEVICE_TO_HOST,
1503
- cann_ctx->stream()));
1544
+ void *transform_buffer = malloc(size);
1545
+ ACL_CHECK(aclrtMemcpyAsync(
1546
+ transform_buffer, size, (char *)tensor->data + offset, size,
1547
+ ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
1504
1548
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
1505
- ggml_backend_cann_transform_back(tensor, transform_buffer,
1506
- (char*)data + offset);
1549
+ ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1507
1550
  free(transform_buffer);
1508
1551
  }
1509
1552
  }
@@ -1521,7 +1564,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
1521
1564
  * @param dst Pointer to the destination tensor to copy data to.
1522
1565
  * @return true if the copy operation succeeds, false otherwise.
1523
1566
  */
1524
- GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
1567
+ static bool ggml_backend_cann_cpy_tensor_async(
1525
1568
  ggml_backend_t backend_src, ggml_backend_t backend_dst,
1526
1569
  const ggml_tensor* src, ggml_tensor* dst) {
1527
1570
  GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
@@ -1589,7 +1632,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
1589
1632
  *
1590
1633
  * @param backend Pointer to the CANN backend structure to synchronize.
1591
1634
  */
1592
- GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1635
+ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1593
1636
  ggml_backend_cann_context* cann_ctx =
1594
1637
  (ggml_backend_cann_context*)backend->context;
1595
1638
 
@@ -1610,7 +1653,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
1610
1653
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
1611
1654
  * completes successfully, otherwise an appropriate error status.
1612
1655
  */
1613
- GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1656
+ static enum ggml_status ggml_backend_cann_graph_compute(
1614
1657
  ggml_backend_t backend, ggml_cgraph* cgraph) {
1615
1658
  ggml_backend_cann_context* cann_ctx =
1616
1659
  (ggml_backend_cann_context*)backend->context;
@@ -1627,7 +1670,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1627
1670
  bool ok = ggml_cann_compute_forward(*cann_ctx, node);
1628
1671
 
1629
1672
  if (!ok) {
1630
- GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1673
+ GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
1631
1674
  node->name, ggml_op_name(node->op));
1632
1675
  }
1633
1676
  GGML_ASSERT(ok);
@@ -1648,7 +1691,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
1648
1691
  * @return bool Returns true if the operation is supported by the backend,
1649
1692
  * otherwise false.
1650
1693
  */
1651
- GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1694
+ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1652
1695
  const ggml_tensor* op) {
1653
1696
  switch (op->op) {
1654
1697
  case GGML_OP_UNARY:
@@ -1666,10 +1709,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1666
1709
  }
1667
1710
  case GGML_OP_MUL_MAT: {
1668
1711
  switch (op->src[0]->type) {
1669
- // case GGML_TYPE_Q4_0:
1670
1712
  case GGML_TYPE_F16:
1671
1713
  case GGML_TYPE_F32:
1672
1714
  case GGML_TYPE_Q8_0:
1715
+ // TODO: fix me
1716
+ // Current groupsize should not be greater than k-1 in
1717
+ // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
1718
+ case GGML_TYPE_Q4_0:
1673
1719
  return true;
1674
1720
  default:
1675
1721
  return false;
@@ -1694,6 +1740,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1694
1740
  case GGML_TYPE_F32:
1695
1741
  case GGML_TYPE_F16:
1696
1742
  case GGML_TYPE_Q8_0:
1743
+ case GGML_TYPE_Q4_0:
1697
1744
  return true;
1698
1745
  default:
1699
1746
  return false;
@@ -1766,7 +1813,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
1766
1813
  * @return bool Returns true if the CANN backend supports the buffer type,
1767
1814
  * otherwise false.
1768
1815
  */
1769
- GGML_CALL static bool ggml_backend_cann_supports_buft(
1816
+ static bool ggml_backend_cann_supports_buft(
1770
1817
  ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
1771
1818
  if (ggml_backend_buft_is_cann(buft)) {
1772
1819
  ggml_backend_cann_context * cann_ctx =
@@ -1792,7 +1839,7 @@ GGML_CALL static bool ggml_backend_cann_supports_buft(
1792
1839
  * @return bool Returns true if the operation should be offloaded, otherwise
1793
1840
  * false.
1794
1841
  */
1795
- GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
1842
+ static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
1796
1843
  const ggml_tensor* op) {
1797
1844
  const int min_batch_size = 32;
1798
1845
  GGML_UNUSED(backend);
@@ -1912,11 +1959,8 @@ static ggml_backend_i ggml_backend_cann_interface = {
1912
1959
  /* .supports_op = */ ggml_backend_cann_supports_op,
1913
1960
  /* .supports_buft = */ ggml_backend_cann_supports_buft,
1914
1961
  /* .offload_op = */ ggml_backend_cann_offload_op,
1915
- /* .event_new = */ ggml_backend_cann_event_new,
1916
- /* .event_free = */ ggml_backend_cann_event_free,
1917
1962
  /* .event_record = */ ggml_backend_cann_event_record,
1918
1963
  /* .event_wait = */ ggml_backend_cann_event_wait,
1919
- /* .event_synchronize = */ ggml_backend_cann_event_synchronize,
1920
1964
  };
1921
1965
 
1922
1966
  /**
@@ -1933,91 +1977,46 @@ static ggml_guid_t ggml_backend_cann_guid() {
1933
1977
  return &guid;
1934
1978
  }
1935
1979
 
1936
- GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
1980
+ ggml_backend_t ggml_backend_cann_init(int32_t device) {
1937
1981
  aclInit(nullptr);
1938
1982
  if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
1939
- GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
1983
+ GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
1940
1984
  return nullptr;
1941
1985
  }
1942
1986
 
1943
1987
  ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
1944
1988
  if (ctx == nullptr) {
1945
- GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
1989
+ GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
1946
1990
  return nullptr;
1947
1991
  }
1948
-
1992
+ ggml_cann_set_device(ctx->device);
1949
1993
  ggml_backend_t cann_backend =
1950
1994
  new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
1951
1995
  /* .interface = */ ggml_backend_cann_interface,
1996
+ /* .device = */ nullptr,
1952
1997
  /* .context = */ ctx};
1953
1998
 
1954
1999
  return cann_backend;
1955
2000
  }
1956
2001
 
1957
- GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) {
2002
+ bool ggml_backend_is_cann(ggml_backend_t backend) {
1958
2003
  return backend != NULL &&
1959
2004
  ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
1960
2005
  }
1961
2006
 
1962
- GGML_CALL int32_t ggml_backend_cann_get_device_count() {
2007
+ int32_t ggml_backend_cann_get_device_count() {
1963
2008
  return ggml_cann_info().device_count;
1964
2009
  }
1965
2010
 
1966
- GGML_CALL void ggml_backend_cann_get_device_description(
2011
+ void ggml_backend_cann_get_device_description(
1967
2012
  int32_t device, char* description, size_t description_size) {
1968
2013
  ggml_cann_set_device(device);
1969
2014
  const char* soc_name = aclrtGetSocName();
1970
2015
  snprintf(description, description_size, "%s", soc_name);
1971
2016
  }
1972
2017
 
1973
- GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
1974
- size_t* total) {
2018
+ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2019
+ size_t* total) {
1975
2020
  ggml_cann_set_device(device);
1976
2021
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
1977
2022
  }
1978
-
1979
- // backend registry
1980
- /**
1981
- * @brief Initializes a CANN backend based on the provided parameters.
1982
- *
1983
- * This function initializes a CANN backend using the device index and then
1984
- * initializes the backend using `ggml_backend_cann_init`.
1985
- *
1986
- * @param params Parameters for initialization (unused in this implementation).
1987
- * @param user_data User data containing the device index to initialize the
1988
- * backend.
1989
- * @return ggml_backend_t The initialized CANN backend.
1990
- */
1991
- GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
1992
- void* user_data) {
1993
- ggml_backend_t cann_backend =
1994
- ggml_backend_cann_init((int)(intptr_t)user_data);
1995
- return cann_backend;
1996
-
1997
- GGML_UNUSED(params);
1998
- }
1999
-
2000
- extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
2001
-
2002
- /**
2003
- * @brief Registers CANN (Ascend) devices as backend options.
2004
- *
2005
- * This function initializes ACL, retrieves the number of available CANN
2006
- * devices, and registers each device as a backend option using
2007
- * `ggml_backend_register`. Each device is given a unique name based on
2008
- * `GGML_CANN_NAME` followed by its index.
2009
- *
2010
- * @return int The number of CANN devices registered.
2011
- */
2012
- GGML_CALL int ggml_backend_cann_reg_devices() {
2013
- uint32_t device_count = ggml_backend_cann_get_device_count();
2014
- // initialization
2015
- for (uint32_t i = 0; i < device_count; i++) {
2016
- char name[128];
2017
- snprintf(name, sizeof(name), "CANN%d", i);
2018
- ggml_backend_register(name, ggml_backend_reg_cann_init,
2019
- ggml_backend_cann_buffer_type(i),
2020
- (void*)(intptr_t)i);
2021
- }
2022
- return device_count;
2023
- }