@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -39,6 +39,8 @@
39
39
 
40
40
  #include "ggml-common.h"
41
41
 
42
+ #define GGML_CANN_NAME "CANN"
43
+
42
44
  /**
43
45
  * @brief Handles CANN errors by printing an error message and aborting.
44
46
  *
@@ -120,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
120
122
  ACL_CHECK(aclrtMemGetAllocationGranularity(
121
123
  &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
122
124
  &info.devices[id].vmm_granularity));
125
+
126
+ size_t free, total;
127
+ ggml_backend_cann_get_device_memory(id, &free, &total);
128
+ info.devices[id].total_vram = free;
123
129
  }
124
130
 
125
131
  // TODO: add more device info later.
@@ -206,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
206
212
  * @return A pointer to the allocated buffer.
207
213
  */
208
214
  void* alloc(size_t size, size_t* actual_size) override {
215
+ const size_t alignment = 128;
216
+ size = GGML_PAD(size, alignment);
217
+ if (size == 0) {
218
+ size = alignment;
219
+ }
209
220
  #ifdef DEBUG_CANN_MALLOC
210
221
  int nnz = 0;
211
222
  size_t max_size = 0;
@@ -244,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
244
255
  return ptr;
245
256
  }
246
257
  void* ptr;
247
- size_t look_ahead_size = (size_t)(1.05 * size);
248
- look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
249
258
  ggml_cann_set_device(device);
250
259
  ACL_CHECK(
251
- aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
252
- *actual_size = look_ahead_size;
253
- pool_size += look_ahead_size;
260
+ aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
261
+ *actual_size = size;
262
+ pool_size += size;
254
263
  #ifdef DEBUG_CANN_MALLOC
255
264
  GGML_LOG_INFO(
256
265
  "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -294,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
294
303
  /**
295
304
  * @brief The maximum size of the virtual memory pool (32 GB).
296
305
  */
297
- static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
306
+ size_t max_size;
298
307
 
299
308
  /**
300
309
  * @brief The device ID associated with this buffer pool.
@@ -339,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
339
348
  */
340
349
  explicit ggml_cann_pool_vmm(int device)
341
350
  : device(device),
342
- granularity(ggml_cann_info().devices[device].vmm_granularity) {}
351
+ granularity(ggml_cann_info().devices[device].vmm_granularity) {
352
+ auto dev = ggml_cann_info().devices[device];
353
+ granularity = dev.vmm_granularity;
354
+ max_size = dev.total_vram;
355
+ }
343
356
 
344
357
  /**
345
358
  * @brief Destructor to free all buffers in the virtual memory pool.
@@ -368,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
368
381
  // round up the allocation size to the alignment to ensure that all
369
382
  // allocations are aligned for all data types
370
383
  const size_t alignment = 128;
371
- size = alignment * ((size + alignment - 1) / alignment);
384
+ size = GGML_PAD(size, alignment);
385
+ if (size == 0) {
386
+ size = alignment;
387
+ }
372
388
 
373
389
  size_t avail = pool_size - pool_used;
374
390
 
375
391
  if (size > avail) {
376
392
  // round up to the next multiple of the granularity
377
393
  size_t reserve_size = size - avail;
378
- reserve_size =
379
- granularity * ((reserve_size + granularity - 1) / granularity);
394
+ reserve_size = GGML_PAD(reserve_size, granularity);
380
395
 
381
- GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
396
+ GGML_ASSERT(pool_size + reserve_size <= max_size);
382
397
 
383
398
  // allocate more physical memory
384
399
  aclrtPhysicalMemProp prop = {};
@@ -394,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
394
409
  // reserve virtual address space (if not already reserved)
395
410
  if (pool_addr == 0) {
396
411
  ACL_CHECK(aclrtReserveMemAddress(
397
- &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
412
+ &pool_addr, max_size, 0, NULL, 1));
398
413
  }
399
414
 
400
415
  // map at the end of the pool
@@ -407,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
407
422
  // add to the pool
408
423
  pool_size += reserve_size;
409
424
 
410
- // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
411
- // reserved %llu MB)\n",
412
- // device, (unsigned long long) (pool_size/1024/1024),
413
- // (unsigned long long) (reserve_size/1024/1024));
425
+ #ifdef DEBUG_CANN_MALLOC
426
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
427
+ device, (unsigned long long) (pool_size/1024/1024),
428
+ (unsigned long long) (reserve_size/1024/1024));
429
+ #endif
414
430
  }
415
431
 
416
432
  GGML_ASSERT(pool_addr != 0);
@@ -455,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
455
471
  */
456
472
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
457
473
  int device) {
458
- // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
459
474
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
460
475
  }
461
476
 
@@ -487,23 +502,6 @@ struct ggml_backend_cann_buffer_context {
487
502
  ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
488
503
  };
489
504
 
490
- /**
491
- * @brief Retrieve the name associated with a CANN buffer.
492
- *
493
- * This function returns the name of a CANN buffer, which is stored in the
494
- * context of the buffer.
495
- *
496
- * @param buffer The CANN buffer whose name is to be retrieved.
497
- * @return A pointer to a C-string containing the name of the buffer.
498
- */
499
-
500
- static const char* ggml_backend_cann_buffer_get_name(
501
- ggml_backend_buffer_t buffer) {
502
- return "CANN";
503
-
504
- GGML_UNUSED(buffer);
505
- }
506
-
507
505
  /**
508
506
  * @brief Check if a buffer is a CANN buffer.
509
507
  *
@@ -513,9 +511,10 @@ static const char* ggml_backend_cann_buffer_get_name(
513
511
  * @param buffer The buffer to check.
514
512
  * @return true if the buffer is a CANN buffer, false otherwise.
515
513
  */
514
+ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
516
515
  static bool ggml_backend_buffer_is_cann(
517
516
  ggml_backend_buffer_t buffer) {
518
- return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
517
+ return ggml_backend_buft_is_cann(buffer->buft);
519
518
  }
520
519
 
521
520
  /**
@@ -851,13 +850,6 @@ static void ggml_backend_cann_buffer_set_tensor(
851
850
  void *transform_buffer = malloc(size);
852
851
  ggml_backend_cann_transform(tensor, data, transform_buffer);
853
852
 
854
- #ifndef NDEBUG
855
- void *check_buffer = malloc(size);
856
- ggml_backend_cann_transform_back(tensor, transform_buffer,
857
- check_buffer);
858
- GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
859
- free(check_buffer);
860
- #endif
861
853
  ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
862
854
  transform_buffer, size,
863
855
  ACL_MEMCPY_HOST_TO_DEVICE));
@@ -969,8 +961,7 @@ static void ggml_backend_cann_buffer_clear(
969
961
  * This structure defines function pointers to operations that can be performed
970
962
  * on a CANN buffer within the backend.
971
963
  */
972
- static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
973
- /* .get_name = */ ggml_backend_cann_buffer_get_name,
964
+ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
974
965
  /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
975
966
  /* .get_base = */ ggml_backend_cann_buffer_get_base,
976
967
  /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
@@ -1004,9 +995,10 @@ struct ggml_backend_cann_buffer_type_context {
1004
995
  */
1005
996
  static const char* ggml_backend_cann_buffer_type_name(
1006
997
  ggml_backend_buffer_type_t buft) {
1007
- return "CANN";
998
+ ggml_backend_cann_buffer_type_context* buft_ctx =
999
+ (ggml_backend_cann_buffer_type_context*)buft->context;
1008
1000
 
1009
- GGML_UNUSED(buft);
1001
+ return buft_ctx->name.c_str();
1010
1002
  }
1011
1003
 
1012
1004
  /**
@@ -1105,19 +1097,25 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1105
1097
  GGML_UNUSED(buft);
1106
1098
  }
1107
1099
 
1100
+ static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1101
+ return false;
1102
+
1103
+ GGML_UNUSED(buft);
1104
+ }
1105
+
1108
1106
  /**
1109
1107
  * @brief Interface for managing CANN buffer types in the GGML backend.
1110
1108
  *
1111
1109
  * Provides function pointers for allocating, querying properties, and managing
1112
1110
  * memory for CANN buffer types in the GGML backend.
1113
1111
  */
1114
- static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
1112
+ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
1115
1113
  /* .get_name = */ ggml_backend_cann_buffer_type_name,
1116
1114
  /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer,
1117
1115
  /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment,
1118
1116
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1119
1117
  /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size,
1120
- /* .is_host = */ NULL,
1118
+ /* .is_host = */ ggml_backend_cann_buffer_type_is_host,
1121
1119
  };
1122
1120
 
1123
1121
  /**
@@ -1145,9 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
1145
1143
  static bool ggml_backend_cann_buffer_type_initialized = false;
1146
1144
 
1147
1145
  if (!ggml_backend_cann_buffer_type_initialized) {
1148
- for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
1146
+ for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
1149
1147
  ggml_backend_cann_buffer_types[i] = {
1150
1148
  /* .iface = */ ggml_backend_cann_buffer_type_interface,
1149
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
1151
1150
  /* .context = */
1152
1151
  new ggml_backend_cann_buffer_type_context{
1153
1152
  i, "CANN" + std::to_string(i)},
@@ -1213,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
1213
1212
  return nullptr;
1214
1213
  }
1215
1214
 
1215
+ const size_t alignment = 128;
1216
+ size = GGML_PAD(size, alignment);
1217
+ if (size == 0) {
1218
+ size = alignment;
1219
+ }
1220
+
1216
1221
  void * hostPtr = nullptr;
1217
1222
  aclError err = aclrtMallocHost((void **) &hostPtr, size);
1218
1223
  if (err != ACL_SUCCESS) {
1219
-
1220
1224
  GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1221
1225
  size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1222
1226
  return nullptr;
@@ -1241,7 +1245,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1241
1245
 
1242
1246
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1243
1247
  buffer->buft = buft;
1244
- buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
1245
1248
  buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1246
1249
 
1247
1250
  return buffer;
@@ -1263,7 +1266,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1263
1266
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1264
1267
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1265
1268
  },
1266
- /* .device = */ nullptr,
1269
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1267
1270
  /* .context = */ nullptr,
1268
1271
  };
1269
1272
 
@@ -1463,24 +1466,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1463
1466
  delete backend;
1464
1467
  }
1465
1468
 
1466
- /**
1467
- * @brief Retrieves the default buffer type associated with the CANN backend.
1468
- *
1469
- * This function returns the buffer type specific to the device associated
1470
- * with the CANN backend. It is used to allocate buffers for computations
1471
- * performed by the backend.
1472
- *
1473
- * @param backend Pointer to the CANN backend structure.
1474
- * @return Pointer to the buffer type structure for the CANN backend.
1475
- */
1476
- static ggml_backend_buffer_type_t
1477
- ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
1478
- ggml_backend_cann_context* cann_ctx =
1479
- (ggml_backend_cann_context*)backend->context;
1480
-
1481
- return ggml_backend_cann_buffer_type(cann_ctx->device);
1482
- }
1483
-
1484
1469
  /**
1485
1470
  * @brief Sets tensor data asynchronously in the CANN backend.
1486
1471
  *
@@ -1510,13 +1495,6 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1510
1495
  void *transform_buffer = malloc(size);
1511
1496
  ggml_backend_cann_transform(tensor, data, transform_buffer);
1512
1497
 
1513
- #ifndef NDEBUG
1514
- void *check_buffer = malloc(size);
1515
- ggml_backend_cann_transform_back(tensor, transform_buffer,
1516
- check_buffer);
1517
- GGML_ASSERT(memcmp(data, check_buffer, size));
1518
- free(check_buffer);
1519
- #endif
1520
1498
  ACL_CHECK(aclrtMemcpyAsync(
1521
1499
  (char *)tensor->data + offset, size, transform_buffer, size,
1522
1500
  ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
@@ -1691,7 +1669,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
1691
1669
  * @return bool Returns true if the operation is supported by the backend,
1692
1670
  * otherwise false.
1693
1671
  */
1694
- static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1672
+ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
1695
1673
  const ggml_tensor* op) {
1696
1674
  switch (op->op) {
1697
1675
  case GGML_OP_UNARY:
@@ -1709,12 +1687,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1709
1687
  }
1710
1688
  case GGML_OP_MUL_MAT: {
1711
1689
  switch (op->src[0]->type) {
1712
- case GGML_TYPE_F16:
1713
- case GGML_TYPE_F32:
1714
1690
  case GGML_TYPE_Q8_0:
1715
- // TODO: fix me
1716
1691
  // Current groupsize should not be greater than k-1 in
1717
- // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
1692
+ // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
1693
+ if (op->src[0]->ne[0] <= QK8_0) {
1694
+ return false;
1695
+ }
1696
+ case GGML_TYPE_F16:
1697
+ case GGML_TYPE_F32:
1718
1698
  case GGML_TYPE_Q4_0:
1719
1699
  return true;
1720
1700
  default:
@@ -1746,9 +1726,50 @@ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1746
1726
  return false;
1747
1727
  }
1748
1728
  }
1729
+ case GGML_OP_CONT: {
1730
+ // TODO: support GGML_TYPE_BF16
1731
+ switch (op->src[0]->type) {
1732
+ case GGML_TYPE_F32:
1733
+ case GGML_TYPE_F16:
1734
+ return true;
1735
+ default:
1736
+ return false;
1737
+ }
1738
+ }
1739
+ case GGML_OP_ROPE: {
1740
+ // TODO: with ops-test v == 1
1741
+ float * ext_factor = (float*)((int32_t*)op->op_params + 7);
1742
+ // TODO: n_dims <= ne0
1743
+ if (op->src[0]->ne[0] != op->op_params[1]) {
1744
+ return false;
1745
+ }
1746
+ // TODO: ext_factor != 0
1747
+ if (*ext_factor != 0) {
1748
+ return false;
1749
+ }
1750
+
1751
+ const int mode = ((const int32_t *) op->op_params)[2];
1752
+ if (mode & GGML_ROPE_TYPE_MROPE) {
1753
+ return false;
1754
+ }
1755
+ if (mode & GGML_ROPE_TYPE_VISION) {
1756
+ return false;
1757
+ }
1758
+
1759
+ return true;
1760
+ }
1761
+ case GGML_OP_UPSCALE: {
1762
+ // aclnnUpsampleNearest2dGetWorkspaceSize does not support unequal scale
1763
+ // ratios, i.e. selfDimN[2]/outDimN[2] != selfDimC[3]/outDimC[3]
1764
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
1765
+ return false;
1766
+ }
1767
+ return true;
1768
+ }
1769
+ case GGML_OP_IM2COL:
1770
+ case GGML_OP_CONCAT:
1749
1771
  case GGML_OP_DUP:
1750
1772
  case GGML_OP_REPEAT:
1751
- case GGML_OP_CONCAT:
1752
1773
  case GGML_OP_NONE:
1753
1774
  case GGML_OP_RESHAPE:
1754
1775
  case GGML_OP_VIEW:
@@ -1762,17 +1783,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1762
1783
  case GGML_OP_SCALE:
1763
1784
  case GGML_OP_SQR:
1764
1785
  case GGML_OP_CLAMP:
1765
- case GGML_OP_CONT:
1766
1786
  case GGML_OP_DIAG_MASK_INF:
1767
1787
  case GGML_OP_SOFT_MAX:
1768
- case GGML_OP_ROPE:
1769
- case GGML_OP_IM2COL:
1770
1788
  case GGML_OP_POOL_2D:
1771
1789
  case GGML_OP_SUM_ROWS:
1772
1790
  case GGML_OP_ARGSORT:
1773
1791
  case GGML_OP_ACC:
1774
1792
  case GGML_OP_GROUP_NORM:
1775
- case GGML_OP_UPSCALE:
1776
1793
  case GGML_OP_PAD:
1777
1794
  case GGML_OP_ARANGE:
1778
1795
  case GGML_OP_TIMESTEP_EMBEDDING:
@@ -1782,7 +1799,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
1782
1799
  return false;
1783
1800
  }
1784
1801
 
1785
- GGML_UNUSED(backend);
1802
+ GGML_UNUSED(dev);
1786
1803
  }
1787
1804
 
1788
1805
  /**
@@ -1800,31 +1817,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
1800
1817
  return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
1801
1818
  }
1802
1819
 
1803
- /**
1804
- * @brief Checks if the CANN backend supports a specific backend buffer type.
1805
- *
1806
- * This function determines whether the CANN backend supports the given backend
1807
- * buffer type by comparing the device context of the backend and buffer type.
1808
- * It returns true if the devices are same between the backend context and
1809
- * buffer type context.
1810
- *
1811
- * @param backend Pointer to the CANN backend.
1812
- * @param buft Pointer to the backend buffer type to check.
1813
- * @return bool Returns true if the CANN backend supports the buffer type,
1814
- * otherwise false.
1815
- */
1816
- static bool ggml_backend_cann_supports_buft(
1817
- ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
1818
- if (ggml_backend_buft_is_cann(buft)) {
1819
- ggml_backend_cann_context * cann_ctx =
1820
- (ggml_backend_cann_context *)backend->context;
1821
- ggml_backend_cann_buffer_type_context * buft_ctx =
1822
- (ggml_backend_cann_buffer_type_context *)buft->context;
1823
- return buft_ctx->device == cann_ctx->device;
1824
- }
1825
- return false;
1826
- }
1827
-
1828
1820
  /**
1829
1821
  * @brief Determines if a tensor operation should be offloaded to the CANN
1830
1822
  * backend.
@@ -1839,54 +1831,14 @@ static bool ggml_backend_cann_supports_buft(
1839
1831
  * @return bool Returns true if the operation should be offloaded, otherwise
1840
1832
  * false.
1841
1833
  */
1842
- static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
1834
+ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
1843
1835
  const ggml_tensor* op) {
1844
1836
  const int min_batch_size = 32;
1845
- GGML_UNUSED(backend);
1837
+ GGML_UNUSED(dev);
1846
1838
 
1847
1839
  return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
1848
1840
  }
1849
1841
 
1850
- /**
1851
- * @brief Creates a new event for the CANN backend.
1852
- *
1853
- * This function initializes a new event for the CANN backend by setting the
1854
- * device and creating an ACL runtime event. The created event is then wrapped
1855
- * in a ggml_backend_event structure and returned.
1856
- *
1857
- * @param backend Pointer to the CANN backend.
1858
- * @return ggml_backend_event_t Returns a pointer to the new event structure.
1859
- */
1860
- static ggml_backend_event_t ggml_backend_cann_event_new(
1861
- ggml_backend_t backend) {
1862
- ggml_backend_cann_context* cann_ctx =
1863
- (ggml_backend_cann_context*)backend->context;
1864
-
1865
- ggml_cann_set_device(cann_ctx->device);
1866
-
1867
- aclrtEvent event;
1868
- ACL_CHECK(aclrtCreateEvent(&event));
1869
-
1870
- return new ggml_backend_event{
1871
- /* .backend = */ backend,
1872
- /* .context = */ event,
1873
- };
1874
- }
1875
-
1876
- /**
1877
- * @brief Frees a CANN backend event.
1878
- *
1879
- * This function destroys the ACL runtime event associated with the given CANN
1880
- * backend event and then deletes the event structure itself.
1881
- *
1882
- * @param event Pointer to the event structure to be freed.
1883
- */
1884
- static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
1885
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
1886
-
1887
- delete event;
1888
- }
1889
-
1890
1842
  /**
1891
1843
  * @brief Records an event on the CANN backend stream.
1892
1844
  *
@@ -1895,10 +1847,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
1895
1847
  *
1896
1848
  * @param event Pointer to the event structure to be recorded.
1897
1849
  */
1898
- static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
1850
+ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
1899
1851
  ggml_backend_cann_context* cann_ctx =
1900
- (ggml_backend_cann_context*)event->backend->context;
1901
-
1852
+ (ggml_backend_cann_context*)backend->context;
1902
1853
  ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
1903
1854
  }
1904
1855
 
@@ -1916,8 +1867,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
1916
1867
  ggml_backend_event_t event) {
1917
1868
  ggml_backend_cann_context* cann_ctx =
1918
1869
  (ggml_backend_cann_context*)backend->context;
1919
-
1920
- if (ggml_backend_is_cann(event->backend)) {
1870
+ if (ggml_backend_is_cann(backend)) {
1921
1871
  ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
1922
1872
  (aclrtEvent)event->context));
1923
1873
  } else {
@@ -1925,17 +1875,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
1925
1875
  }
1926
1876
  }
1927
1877
 
1928
- /**
1929
- * @brief Synchronizes the given event on the CANN backend.
1930
- *
1931
- * This function waits for the specified event to complete on the ACL runtime.
1932
- *
1933
- * @param event Pointer to the event structure to be synchronized.
1934
- */
1935
- static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
1936
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
1937
- }
1938
-
1939
1878
  /**
1940
1879
  * @brief Structure defining the interface for the CANN backend.
1941
1880
  *
@@ -1943,10 +1882,9 @@ static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
1943
1882
  * supported by the CANN backend, including name retrieval, memory
1944
1883
  * management, tensor operations, synchronization, and event handling.
1945
1884
  */
1946
- static ggml_backend_i ggml_backend_cann_interface = {
1885
+ static const ggml_backend_i ggml_backend_cann_interface = {
1947
1886
  /* .get_name = */ ggml_backend_cann_name,
1948
1887
  /* .free = */ ggml_backend_cann_free,
1949
- /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
1950
1888
  /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
1951
1889
  /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
1952
1890
  /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
@@ -1956,9 +1894,6 @@ static ggml_backend_i ggml_backend_cann_interface = {
1956
1894
  /* .graph_plan_update = */ NULL,
1957
1895
  /* .graph_plan_compute = */ NULL,
1958
1896
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
1959
- /* .supports_op = */ ggml_backend_cann_supports_op,
1960
- /* .supports_buft = */ ggml_backend_cann_supports_buft,
1961
- /* .offload_op = */ ggml_backend_cann_offload_op,
1962
1897
  /* .event_record = */ ggml_backend_cann_event_record,
1963
1898
  /* .event_wait = */ ggml_backend_cann_event_wait,
1964
1899
  };
@@ -1977,6 +1912,235 @@ static ggml_guid_t ggml_backend_cann_guid() {
1977
1912
  return &guid;
1978
1913
  }
1979
1914
 
1915
+ // backend device
1916
+ struct ggml_backend_cann_device_context {
1917
+ int device;
1918
+ std::string name;
1919
+ std::string description;
1920
+ };
1921
+
1922
+ static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
1923
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1924
+ return ctx->name.c_str();
1925
+ }
1926
+
1927
+ static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
1928
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1929
+ return ctx->description.c_str();
1930
+ }
1931
+
1932
+ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
1933
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1934
+ ggml_backend_cann_get_device_memory(ctx->device, free, total);
1935
+ }
1936
+
1937
+ static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
1938
+ GGML_UNUSED(dev);
1939
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
1940
+ }
1941
+
1942
+ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
1943
+ props->name = ggml_backend_cann_device_get_name(dev);
1944
+ props->description = ggml_backend_cann_device_get_description(dev);
1945
+ props->type = ggml_backend_cann_device_get_type(dev);
1946
+ ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
1947
+
1948
+ bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
1949
+
1950
+ props->caps = {
1951
+ /* .async = */ false,
1952
+ /* .host_buffer = */ host_buffer,
1953
+ /* .buffer_from_host_ptr = */ false,
1954
+ /* .events = */ true,
1955
+ };
1956
+ }
1957
+
1958
+ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
1959
+ GGML_UNUSED(params);
1960
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1961
+ return ggml_backend_cann_init(ctx->device);
1962
+ }
1963
+
1964
+ /**
1965
+ * @brief Checks if the CANN backend supports a specific backend buffer type.
1966
+ *
1967
+ * This function determines whether the CANN backend supports the given backend
1968
+ * buffer type by comparing the device context of the backend and buffer type.
1969
+ * It returns true if the devices are same between the backend context and
1970
+ * buffer type context.
1971
+ *
1972
+ * @param dev Pointer to the CANN backend device.
1973
+ * @param buft Pointer to the backend buffer type to check.
1974
+ * @return bool Returns true if the CANN backend supports the buffer type,
1975
+ * otherwise false.
1976
+ */
1977
+ static bool ggml_backend_cann_supports_buft(
1978
+ ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
1979
+ if (ggml_backend_buft_is_cann(buft)) {
1980
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
1981
+ ggml_backend_cann_buffer_type_context * buft_ctx =
1982
+ (ggml_backend_cann_buffer_type_context *)buft->context;
1983
+ return buft_ctx->device == dev_ctx->device;
1984
+ }
1985
+ return false;
1986
+ }
1987
+
1988
+ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
1989
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
1990
+ return ggml_backend_cann_buffer_type(ctx->device);
1991
+ }
1992
+
1993
+ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
1994
+ GGML_UNUSED(dev);
1995
+ return ggml_backend_cann_host_buffer_type();
1996
+ }
1997
+
1998
+ /**
1999
+ * @brief Creates a new event for the CANN backend device.
2000
+ *
2001
+ * This function initializes a new event for the CANN backend by setting the
2002
+ * device and creating an ACL runtime event. The created event is then wrapped
2003
+ * in a ggml_backend_event structure and returned.
2004
+ *
2005
+ * @param dev Pointer to the CANN backend device.
2006
+ * @return ggml_backend_event_t Returns a pointer to the new event structure.
2007
+ */
2008
+ static ggml_backend_event_t ggml_backend_cann_device_event_new(
2009
+ ggml_backend_dev_t dev) {
2010
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2011
+
2012
+ ggml_cann_set_device(dev_ctx->device);
2013
+
2014
+ aclrtEvent event;
2015
+ ACL_CHECK(aclrtCreateEvent(&event));
2016
+
2017
+ return new ggml_backend_event{
2018
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
2019
+ /* .context = */ event,
2020
+ };
2021
+ }
2022
+
2023
+ /**
2024
+ * @brief Frees a CANN backend event.
2025
+ *
2026
+ * This function destroys the ACL runtime event associated with the given CANN
2027
+ * backend event and then deletes the event structure itself.
2028
+ *
2029
+ * @param event Pointer to the event structure to be freed.
2030
+ */
2031
+ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2032
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
2033
+
2034
+ delete event;
2035
+ GGML_UNUSED(dev);
2036
+ }
2037
+
2038
+ /**
2039
+ * @brief Synchronizes the given event on the CANN backend.
2040
+ *
2041
+ * This function waits for the specified event to complete on the ACL runtime.
2042
+ *
2043
+ * @param event Pointer to the event structure to be synchronized.
2044
+ */
2045
+ static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2046
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
2047
+
2048
+ GGML_UNUSED(dev);
2049
+ }
2050
+
2051
+ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2052
+ /* .get_name = */ ggml_backend_cann_device_get_name,
2053
+ /* .get_description = */ ggml_backend_cann_device_get_description,
2054
+ /* .get_memory = */ ggml_backend_cann_device_get_memory,
2055
+ /* .get_type = */ ggml_backend_cann_device_get_type,
2056
+ /* .get_props = */ ggml_backend_cann_device_get_props,
2057
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2058
+ /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
2059
+ /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
2060
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2061
+ /* .supports_op = */ ggml_backend_cann_supports_op,
2062
+ /* .supports_buft = */ ggml_backend_cann_supports_buft,
2063
+ /* .offload_op = */ ggml_backend_cann_offload_op,
2064
+ /* .event_new = */ ggml_backend_cann_device_event_new,
2065
+ /* .event_free = */ ggml_backend_cann_device_event_free,
2066
+ /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
2067
+ };
2068
+
2069
+
2070
+ // backend reg
2071
+ struct ggml_backend_cann_reg_context {
2072
+ std::vector<ggml_backend_dev_t> devices;
2073
+ };
2074
+
2075
+ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
2076
+ GGML_UNUSED(reg);
2077
+ return GGML_CANN_NAME;
2078
+ }
2079
+
2080
+ static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
2081
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2082
+ return ctx->devices.size();
2083
+ }
2084
+
2085
+ static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
2086
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2087
+ GGML_ASSERT(index < ctx->devices.size());
2088
+ return ctx->devices[index];
2089
+ }
2090
+
2091
+ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
2092
+ GGML_UNUSED(reg);
2093
+ GGML_UNUSED(name);
2094
+ // reserved for future use
2095
+ return nullptr;
2096
+ }
2097
+
2098
+ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2099
+ /* .get_name = */ ggml_backend_cann_reg_get_name,
2100
+ /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
2101
+ /* .get_device = */ ggml_backend_cann_reg_get_device,
2102
+ /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
2103
+ };
2104
+
2105
+ // backend registry; initialized only once for the CANN backend
2106
+ ggml_backend_reg_t ggml_backend_cann_reg() {
2107
+ static ggml_backend_reg reg;
2108
+ static bool initialized = false;
2109
+
2110
+ {
2111
+ static std::mutex mutex;
2112
+ std::lock_guard<std::mutex> lock(mutex);
2113
+ if (!initialized) {
2114
+ aclInit(nullptr);
2115
+ ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
2116
+
2117
+ for (int i = 0; i < ggml_cann_info().device_count; i++) {
2118
+ ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
2119
+ dev_ctx->description = aclrtGetSocName();
2120
+ dev_ctx->device = i;
2121
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2122
+ ggml_cann_set_device(i);
2123
+ ggml_backend_dev_t dev = new ggml_backend_device {
2124
+ /* .iface = */ ggml_backend_cann_device_interface,
2125
+ /* .reg = */ &reg,
2126
+ /* .context = */ dev_ctx
2127
+ };
2128
+ ctx->devices.push_back(dev);
2129
+ }
2130
+
2131
+ reg = ggml_backend_reg {
2132
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
2133
+ /* .iface = */ ggml_backend_cann_reg_interface,
2134
+ /* .context = */ ctx
2135
+ };
2136
+ }
2137
+
2138
+ initialized = true;
2139
+ }
2140
+
2141
+ return &reg;
2142
+ }
2143
+
1980
2144
  ggml_backend_t ggml_backend_cann_init(int32_t device) {
1981
2145
  aclInit(nullptr);
1982
2146
  if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
@@ -1993,7 +2157,7 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
1993
2157
  ggml_backend_t cann_backend =
1994
2158
  new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
1995
2159
  /* .interface = */ ggml_backend_cann_interface,
1996
- /* .device = */ nullptr,
2160
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
1997
2161
  /* .context = */ ctx};
1998
2162
 
1999
2163
  return cann_backend;
@@ -2020,3 +2184,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2020
2184
  ggml_cann_set_device(device);
2021
2185
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2022
2186
  }
2187
+
2188
+ GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)