@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -0,0 +1,367 @@
1
+ #include "ggml-blas.h"
2
+ #include "ggml-backend-impl.h"
3
+
4
+ #include <future>
5
+ #include <vector>
6
+
7
+ #if defined(GGML_USE_ACCELERATE)
8
+ # include <Accelerate/Accelerate.h>
9
+ #elif defined(GGML_BLAS_USE_MKL)
10
+ # include <mkl.h>
11
+ #elif defined(GGML_BLAS_USE_BLIS)
12
+ # include <blis.h>
13
+ #elif defined(GGML_BLAS_USE_NVPL)
14
+ # include <nvpl_blas.h>
15
+ #else
16
+ # include <cblas.h>
17
+ #endif
18
+
19
+ struct ggml_backend_blas_context { // per-backend state for the BLAS backend
20
+ int n_threads = GGML_DEFAULT_N_THREADS; // thread count; overwritten by ggml_backend_blas_set_n_threads
21
+ std::unique_ptr<char[]> work_data; // scratch buffer; mul_mat stores the float copy of a non-F32 src0 here
22
+ size_t work_size = 0; // size in bytes of the allocation held by work_data
23
+ #ifndef GGML_USE_OPENMP
24
+ std::vector<std::future<void>> tasks; // std::async conversion tasks still in flight (non-OpenMP builds only)
25
+ #endif
26
+ };
27
+
28
+ // helper function to determine if it is better to use BLAS or not
29
+ // for large matrices, BLAS is faster
30
+ static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { // dst: the op node; its src[0] and src[1] are inspected
31
+ const struct ggml_tensor * src0 = dst->src[0];
32
+ const struct ggml_tensor * src1 = dst->src[1];
33
+
34
+ const int64_t ne10 = src1->ne[0];
35
+
36
+ const int64_t ne0 = dst->ne[0];
37
+ const int64_t ne1 = dst->ne[1];
38
+
39
+ // TODO: find the optimal values for these
40
+ if (ggml_is_contiguous(src0) && // returns true only if both sources are contiguous,
41
+ ggml_is_contiguous(src1) &&
42
+ src1->type == GGML_TYPE_F32 && // src1 is F32,
43
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { // and all three sizes reach the threshold of 32
44
+
45
+ /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
46
+ return true;
47
+ }
48
+
49
+ return false;
50
+ }
51
+
52
+ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { // computes the MUL_MAT node dst with one cblas_sgemm per (i12, i13) plane, converting a non-F32 src0 to float first
53
+ const struct ggml_tensor * src0 = dst->src[0];
54
+ const struct ggml_tensor * src1 = dst->src[1];
55
+
56
+ GGML_TENSOR_BINARY_OP_LOCALS
57
+
58
+ const enum ggml_type type = src0->type;
59
+
60
+ GGML_ASSERT(ne0 == ne01);
61
+ GGML_ASSERT(ne1 == ne11);
62
+ GGML_ASSERT(ne2 == ne12);
63
+ GGML_ASSERT(ne3 == ne13);
64
+
65
+ // we don't support permuted src0 or src1
66
+ GGML_ASSERT(nb00 == ggml_type_size(type));
67
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
68
+
69
+ // dst cannot be transposed or permuted
70
+ GGML_ASSERT(nb0 == sizeof(float));
71
+ GGML_ASSERT(nb0 <= nb1);
72
+ GGML_ASSERT(nb1 <= nb2);
73
+ GGML_ASSERT(nb2 <= nb3);
74
+
75
+ // broadcast factors
76
+ const int64_t r2 = ne12/ne02;
77
+ const int64_t r3 = ne13/ne03;
78
+
79
+ const int64_t ne_plane = ne01*ne00; // number of elements in one 2D plane of src0
80
+ const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); // no scratch space is needed when src0 is already F32
81
+
82
+ if (ctx->work_size < desired_wsize) { // grow only; a buffer that is already large enough is reused across calls
83
+ ctx->work_data.reset(new char[desired_wsize]);
84
+ ctx->work_size = desired_wsize;
85
+ }
86
+ void * wdata = ctx->work_data.get();
87
+
88
+ // convert src0 to float
89
+ if (type != GGML_TYPE_F32) {
90
+ ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
91
+ ggml_to_float_t const to_float = type_traits.to_float;
92
+
93
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
94
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
95
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
96
+ float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; // destination plane inside the scratch buffer
97
+
98
+ const int min_cols_per_thread = 4096;
99
+ const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
100
+ const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); // capped by ctx->n_threads, never below 1
101
+
102
+ #ifdef GGML_USE_OPENMP
103
+ #pragma omp parallel for num_threads(n_threads)
104
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
105
+ to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
106
+ }
107
+ #else
108
+ for (int i = 1; i < n_threads; i++) { // row slices 1..n_threads-1 are handed to std::async tasks
109
+ const int64_t start = i*ne01/n_threads;
110
+ const int64_t end = (i + 1)*ne01/n_threads;
111
+ if (start < end) {
112
+ ctx->tasks.push_back(std::async(std::launch::async, [=]() {
113
+ for (int64_t i01 = start; i01 < end; i01++) {
114
+ to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
115
+ }
116
+ }));
117
+ }
118
+ }
119
+ {
120
+ // reuse the current thread for the first task
121
+ const int64_t start = 0;
122
+ const int64_t end = ne01/n_threads;
123
+ for (int64_t i01 = start; i01 < end; i01++) {
124
+ to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
125
+ }
126
+ }
127
+ #endif
128
+ }
129
+ }
130
+
131
+ #ifndef GGML_USE_OPENMP
132
+ // wait for all tasks to finish
133
+ for (auto & task : ctx->tasks) {
134
+ task.get();
135
+ }
136
+ ctx->tasks.clear();
137
+ #endif
138
+ }
139
+
140
+ #if defined(OPENBLAS_VERSION)
141
+ openblas_set_num_threads(ctx->n_threads);
142
+ #endif
143
+
144
+ #if defined(GGML_BLAS_USE_BLIS)
145
+ bli_thread_set_num_threads(ctx->n_threads);
146
+ #endif
147
+
148
+ #if defined(GGML_BLAS_USE_NVPL)
149
+ nvpl_blas_set_num_threads(ctx->n_threads);
150
+ #endif
151
+
152
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
153
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
154
+ const int64_t i03 = i13/r3; // broadcast: map the src1 batch index back to the src0 batch index
155
+ const int64_t i02 = i12/r2;
156
+
157
+ const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
158
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
159
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
160
+
161
+ if (type != GGML_TYPE_F32) {
162
+ x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; // use the float copy made above instead of the raw src0 data
163
+ }
164
+
165
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, // row-major; y is passed untransposed, x transposed
166
+ ne1, ne01, ne10,
167
+ 1.0f, y, ne10,
168
+ x, ne00,
169
+ 0.0f, d, ne01); // beta is 0, so the previous contents of d are not accumulated
170
+ }
171
+ }
172
+ }
173
+
174
+ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { // computes the OUT_PROD node dst with a single cblas_sgemm call; ctx is unused
175
+ const struct ggml_tensor * src0 = dst->src[0];
176
+ const struct ggml_tensor * src1 = dst->src[1];
177
+
178
+ GGML_TENSOR_BINARY_OP_LOCALS
179
+
180
+ GGML_ASSERT(ne0 == ne00);
181
+ GGML_ASSERT(ne1 == ne10);
182
+ GGML_ASSERT(ne2 == ne02);
183
+ GGML_ASSERT(ne02 == ne12);
184
+ GGML_ASSERT(ne3 == ne13);
185
+ GGML_ASSERT(ne03 == ne13);
186
+
187
+ // we don't support permuted src0 or src1
188
+ GGML_ASSERT(nb00 == sizeof(float));
189
+
190
+ // dst cannot be transposed or permuted
191
+ GGML_ASSERT(nb0 == sizeof(float));
192
+ // GGML_ASSERT(nb0 <= nb1);
193
+ // GGML_ASSERT(nb1 <= nb2);
194
+ // GGML_ASSERT(nb2 <= nb3);
195
+
196
+ // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
197
+ // src0: (k,n)
198
+ // src1: (k,m)
199
+ // dst: (m,n)
200
+ //
201
+ // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
202
+ // Also expressed as (major,minor)
203
+ // a: (m,k): so src1 transposed
204
+ // b: (k,n): so src0
205
+ // c: (m,n)
206
+ //
207
+ // However, if ggml_is_transposed(src1) is true, then
208
+ // src1->data already contains a transposed version, so sgemm mustn't
209
+ // transpose it further.
210
+
211
+ int n = src0->ne[0]; // NOTE(review): int64_t sizes are narrowed to int here and below; presumably they always fit, confirm
212
+ int k = src0->ne[1];
213
+ int m = src1->ne[0];
214
+
215
+ CBLAS_TRANSPOSE transposeA;
216
+ int lda;
217
+
218
+ if (!ggml_is_transposed(src1)) {
219
+ transposeA = CblasTrans;
220
+ lda = m;
221
+ } else {
222
+ transposeA = CblasNoTrans;
223
+ lda = k;
224
+ }
225
+
226
+ float * a = (float *) ((char *) src1->data); // base data pointers only: no per-plane offsets are applied in this function
227
+ float * b = (float *) ((char *) src0->data);
228
+ float * c = (float *) ((char *) dst->data);
229
+
230
+ cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
231
+
232
+ GGML_UNUSED(ctx);
233
+ }
234
+
235
+ // backend interface
236
+
237
+ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { // get_name callback: always returns the literal "BLAS"
238
+ return "BLAS";
239
+
240
+ GGML_UNUSED(backend);
241
+ }
242
+
243
+ GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { // free callback: deletes the context, then the backend object itself
244
+ ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
245
+ delete ctx;
246
+ delete backend;
247
+ }
248
+
249
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { // get_default_buffer_type callback: forwards to the CPU buffer type
250
+ return ggml_backend_cpu_buffer_type();
251
+
252
+ GGML_UNUSED(backend);
253
+ }
254
+
255
+ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { // graph_compute callback: visits cgraph->nodes in index order and returns GGML_STATUS_SUCCESS
256
+ ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
257
+
258
+ for (int i = 0; i < cgraph->n_nodes; i++) {
259
+ struct ggml_tensor * node = cgraph->nodes[i];
260
+
261
+ switch (node->op) {
262
+ case GGML_OP_MUL_MAT:
263
+ ggml_backend_blas_mul_mat(ctx, node);
264
+ break;
265
+
266
+ case GGML_OP_OUT_PROD:
267
+ ggml_backend_blas_out_prod(ctx, node);
268
+ break;
269
+
270
+ case GGML_OP_NONE: // these five ops perform no computation in this backend
271
+ case GGML_OP_RESHAPE:
272
+ case GGML_OP_VIEW:
273
+ case GGML_OP_PERMUTE:
274
+ case GGML_OP_TRANSPOSE:
275
+ break;
276
+
277
+ default: // any other op is treated as fatal rather than reported through the return status
278
+ GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
279
+ }
280
+ }
281
+
282
+ return GGML_STATUS_SUCCESS;
283
+
284
+ GGML_UNUSED(backend);
285
+ }
286
+
287
+ GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { // supports_op callback: accepts only qualifying MUL_MAT and OUT_PROD nodes
288
+ const struct ggml_tensor * src0 = op->src[0];
289
+ const struct ggml_tensor * src1 = op->src[1];
290
+
291
+ return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || // MUL_MAT: defer to the size/layout check in ggml_backend_blas_use_blas
292
+ (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && // OUT_PROD: both sources must be F32 matrices
293
+ op->src[1]->type == GGML_TYPE_F32 &&
294
+ ggml_is_matrix(src0) &&
295
+ ggml_is_matrix(src1) &&
296
+ ggml_is_contiguous(src0) &&
297
+ (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); // src1 may be either contiguous or transposed
298
+
299
+ GGML_UNUSED(backend);
300
+ }
301
+
302
+ GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { // supports_buft callback: returns whatever ggml_backend_buft_is_host reports for buft
303
+ return ggml_backend_buft_is_host(buft);
304
+
305
+ GGML_UNUSED(backend);
306
+ }
307
+
308
+ static struct ggml_backend_i blas_backend_i = { // callback table for the BLAS backend; entries left NULL are not implemented here
309
+ /* .get_name = */ ggml_backend_blas_name,
310
+ /* .free = */ ggml_backend_blas_free,
311
+ /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
312
+ /* .set_tensor_async = */ NULL,
313
+ /* .get_tensor_async = */ NULL,
314
+ /* .cpy_tensor_async = */ NULL,
315
+ /* .synchronize = */ NULL,
316
+ /* .graph_plan_create = */ NULL,
317
+ /* .graph_plan_free = */ NULL,
318
+ /* .graph_plan_update = */ NULL,
319
+ /* .graph_plan_compute = */ NULL,
320
+ /* .graph_compute = */ ggml_backend_blas_graph_compute,
321
+ /* .supports_op = */ ggml_backend_blas_supports_op,
322
+ /* .supports_buft = */ ggml_backend_blas_supports_buft,
323
+ /* .offload_op = */ NULL,
324
+ /* .event_new = */ NULL,
325
+ /* .event_free = */ NULL,
326
+ /* .event_record = */ NULL,
327
+ /* .event_wait = */ NULL,
328
+ /* .event_synchronize = */ NULL,
329
+ };
330
+
331
+ static ggml_guid_t ggml_backend_blas_guid(void) { // returns the address of the function-local static GUID that identifies this backend
332
+ static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
333
+ return &guid;
334
+ }
335
+
336
+ ggml_backend_t ggml_backend_blas_init(void) { // creates a BLAS backend; context and backend are allocated with new and released by ggml_backend_blas_free
337
+ ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
338
+
339
+ ggml_backend_t backend = new ggml_backend {
340
+ /* .guid = */ ggml_backend_blas_guid(),
341
+ /* .interface = */ blas_backend_i,
342
+ /* .context = */ ctx,
343
+ };
344
+
345
+ #if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
346
+ if (openblas_get_parallel() != OPENBLAS_OPENMP) { // debug builds only: warn about a threading-model mismatch, but still return the backend
347
+ fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
348
+ }
349
+ #endif
350
+
351
+ #if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
352
+ fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
353
+ #endif
354
+
355
+ return backend;
356
+ }
357
+
358
+ GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { // true when backend is non-NULL and its guid matches the BLAS backend GUID
359
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
360
+ }
361
+
362
+ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { // stores n_threads in the backend context; asserts that backend_blas is a BLAS backend
363
+ GGML_ASSERT(ggml_backend_is_blas(backend_blas));
364
+
365
+ ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
366
+ ctx->n_threads = n_threads; // the value is stored as given; no range check is applied here
367
+ }
@@ -0,0 +1,198 @@
1
+ /*
2
+ * Copyright (c) 2023-2024 The ggml authors
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ * of this software and associated documentation files (the "Software"), to
6
+ * deal in the Software without restriction, including without limitation the
7
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8
+ * sell copies of the Software, and to permit persons to whom the Software is
9
+ * furnished to do so, subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in
12
+ * all copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20
+ * IN THE SOFTWARE.
21
+ */
22
+
23
+ #include "acl_tensor.h"
24
+
25
+ #include <algorithm>
26
+ #include <cstring>
27
+
28
+ aclDataType ggml_cann_type_mapping(ggml_type type) {
29
+ switch (type) {
30
+ case GGML_TYPE_F32:
31
+ return ACL_FLOAT;
32
+ case GGML_TYPE_F16:
33
+ return ACL_FLOAT16;
34
+ case GGML_TYPE_I8:
35
+ return ACL_INT8;
36
+ case GGML_TYPE_I16:
37
+ return ACL_INT16;
38
+ case GGML_TYPE_I32:
39
+ return ACL_INT32;
40
+ default:
41
+ return ACL_DT_UNDEFINED;
42
+ }
43
+ return ACL_DT_UNDEFINED;
44
+ }
45
+
46
+ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
47
+ size_t* nb, int64_t dims, aclFormat format,
48
+ size_t offset) {
49
+ // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be
50
+ // added.
51
+ int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
52
+
53
+ int64_t acl_storage_len = 0;
54
+ if (ne == nullptr) {
55
+ acl_storage_len = ggml_nbytes(tensor);
56
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
57
+ acl_ne[i] = tensor->ne[i];
58
+ // The step size of acl is in elements.
59
+ acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
60
+ }
61
+ } else {
62
+ // With bcast
63
+ for (int i = 0; i < dims; i++) {
64
+ acl_storage_len += (ne[i] - 1) * nb[i];
65
+ acl_ne[i] = ne[i];
66
+ acl_stride[i] = nb[i] / ggml_element_size(tensor);
67
+ }
68
+ }
69
+
70
+ // Reverse ne and stride.
71
+ int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
72
+ std::reverse(acl_ne, acl_ne + final_dims);
73
+ std::reverse(acl_stride, acl_stride + final_dims);
74
+
75
+ aclTensor* acl_tensor = aclCreateTensor(
76
+ acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
77
+ offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
78
+ tensor->data);
79
+
80
+ return acl_tensor;
81
+ }
82
+
83
+ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
84
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
85
+ if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
86
+ return true;
87
+ }
88
+ }
89
+ return false;
90
+ }
91
+
92
+ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
93
+ size_t type_size, int64_t* ne, size_t* nb,
94
+ int64_t dims, aclFormat format,
95
+ size_t offset) {
96
+ int64_t tmp_ne[GGML_MAX_DIMS * 2];
97
+ int64_t tmp_stride[GGML_MAX_DIMS * 2];
98
+
99
+ memcpy(tmp_ne, ne, dims * sizeof(int64_t));
100
+ for (int i = 0; i < dims; i++) {
101
+ tmp_stride[i] = nb[i] / type_size;
102
+ }
103
+
104
+ std::reverse(tmp_ne, tmp_ne + dims);
105
+ std::reverse(tmp_stride, tmp_stride + dims);
106
+
107
+ int64_t acl_storage_len = 0;
108
+ for (int i = 0; i < dims; i++) {
109
+ acl_storage_len += (ne[i] - 1) * nb[i];
110
+ }
111
+
112
+ aclTensor* acl_tensor =
113
+ aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
114
+ format, &acl_storage_len, 1, data_ptr);
115
+
116
+ return acl_tensor;
117
+ }
118
+
119
+ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
120
+ const ggml_tensor* src1,
121
+ int64_t* bcast_src0_ne,
122
+ int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
123
+ size_t* bcast_src1_nb) {
124
+ GGML_ASSERT(ggml_can_repeat(src1, src0));
125
+ int bcast_dim_cnt = 0;
126
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
127
+ int64_t nr = src0->ne[i] / src1->ne[i];
128
+ bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
129
+ bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
130
+ bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
131
+ bcast_src1_nb[bcast_dim_cnt] = src1->nb[i];
132
+ bcast_dim_cnt++;
133
+ if (nr != 1) {
134
+ // Need to add an extra dim.
135
+ bcast_src0_ne[bcast_dim_cnt] = nr;
136
+ bcast_src1_ne[bcast_dim_cnt] = 1;
137
+ bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
138
+ bcast_src0_ne[bcast_dim_cnt - 1];
139
+ bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
140
+ bcast_src1_ne[bcast_dim_cnt - 1];
141
+ bcast_dim_cnt++;
142
+ }
143
+ }
144
+ return bcast_dim_cnt;
145
+ }
146
+
147
+ int64_t ggml_cann_get_mulmat_bcast_shape(
148
+ const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
149
+ const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
150
+ int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
151
+ size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
152
+ // input and dst shoule in same shape, except first two dims.
153
+ GGML_ASSERT(input_ne[2] == dst_ne[2]);
154
+ GGML_ASSERT(input_ne[3] == dst_ne[3]);
155
+
156
+ int bcast_dim_cnt = 0;
157
+
158
+ // For mul_mat, a dimension needs to be added before the dimension that
159
+ // weight needs to be expanded to satisfy the bcast rule of matrix
160
+ // multiplication.
161
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
162
+ int64_t nr = input_ne[i] / weight_ne[i];
163
+ // Do not use bcast in the first two dimensions because we only support
164
+ // the bcast batch dimension. Just copy them.
165
+ if (i < 2 || nr == 1) {
166
+ bcast_input_ne[bcast_dim_cnt] = input_ne[i];
167
+ bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
168
+ bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
169
+
170
+ bcast_input_nb[bcast_dim_cnt] = input_nb[i];
171
+ bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
172
+ bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
173
+ bcast_dim_cnt++;
174
+ } else {
175
+ // Need to add an extra dim.
176
+ bcast_input_ne[bcast_dim_cnt] = nr;
177
+ bcast_dst_ne[bcast_dim_cnt] = nr;
178
+ bcast_weight_ne[bcast_dim_cnt] = 1;
179
+ bcast_input_nb[bcast_dim_cnt] = input_nb[i];
180
+ bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
181
+ bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
182
+ bcast_dim_cnt++;
183
+
184
+ bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
185
+ bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
186
+ bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
187
+ bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
188
+ bcast_input_ne[bcast_dim_cnt - 1];
189
+ bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
190
+ bcast_dst_ne[bcast_dim_cnt - 1];
191
+ bcast_weight_nb[bcast_dim_cnt] =
192
+ bcast_weight_nb[bcast_dim_cnt - 1] *
193
+ bcast_weight_ne[bcast_dim_cnt - 1];
194
+ bcast_dim_cnt++;
195
+ }
196
+ }
197
+ return bcast_dim_cnt;
198
+ }